2024/2025
Authors: Daniel Gutierrez & Fabio Pimentel
# Required Packages--------------------
rm(list = ls())
library(readxl)
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(corrplot)
## corrplot 0.95 loaded
library(feasts)
## Loading required package: fabletools
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
library(tsibble)
##
## Attaching package: 'tsibble'
## The following object is masked from 'package:lubridate':
##
## interval
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(tidyr)
library(ggthemes)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(DIMORA)
## Loading required package: minpack.lm
## Loading required package: numDeriv
## Loading required package: reshape2
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
## Loading required package: deSolve
library(tseries)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following object is masked from 'package:tsibble':
##
## index
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(prophet)
## Loading required package: Rcpp
## Loading required package: rlang
# Data loading ------------------------------------------------------------
# All paths are relative to the project root; adapt setwd() only for
# interactive use.
#setwd('/Users/fabiopimentel/Documents/Padua/clases/segundo año primer semestre/BEF data/proyecto/time_series_padova-main')
# target variable: daily restaurant sales (total, bar, food), in COP
sales <- read_excel("data/sales/sales_dimsum_31102024.xlsx")
# NOTE(review): NA rows are treated as zero-sales days (restaurant closed);
# confirm missing really means "closed" and not "data not recorded".
sales[is.na(sales)] <- 0 # set to zero na values
# economic variables
eco_growth <- read_excel("data/macroeconomic/economic_activity.xlsx")
fx <- read_excel("data/macroeconomic/fx.xlsx") # foreign exchange rate (COP per USD)
inflation <- read_excel("data/macroeconomic/inflation.xlsx")
unemployment <- read_excel("data/macroeconomic/unemployment.xlsx")
# other variables
google_trends <- read_excel("data/other/google_trends_restaurantes.xlsx")
rain <- read_excel("data/other/rain_proxy.xlsx")
temp <- read_excel("data/other/temperature_data.xlsx")
# fill NAs in the weather tables with 0 before inspecting
temp[is.na(temp)] <- 0
rain[is.na(rain)] <- 0
# visual check that the zero-imputation did not distort the temperature series
plot(temp$tavg) # no zeros in temp : OK
plot(temp$tmedian) # no zeros in temp : OK- looks better than mean
# Structure of each raw table (str() output retained from the knitted run) --
str(sales)
## tibble [1,095 × 4] (S3: tbl_df/tbl/data.frame)
## $ date : POSIXct[1:1095], format: "2021-11-01" "2021-11-02" ...
## $ sales_cop: num [1:1095] 0 0 0 0 0 0 0 0 0 0 ...
## $ bar : num [1:1095] 0 0 0 0 0 0 0 0 0 0 ...
## $ food : num [1:1095] 0 0 0 0 0 0 0 0 0 0 ...
str(eco_growth)
## tibble [238 × 2] (S3: tbl_df/tbl/data.frame)
## $ date : POSIXct[1:238], format: "2005-01-01" "2005-02-01" ...
## $ ise_original: num [1:238] 59.8 61.5 62.2 63 62.7 ...
str(fx) # Foreign exchange is the conversion of one currency into another
## tibble [12,020 × 2] (S3: tbl_df/tbl/data.frame)
## $ date: POSIXct[1:12020], format: "1991-11-27" "1991-11-28" ...
## $ fx : num [1:12020] 693 694 695 695 643 ...
str(inflation)
## tibble [832 × 2] (S3: tbl_df/tbl/data.frame)
## $ date : POSIXct[1:832], format: "1955-07-01" "1955-08-01" ...
## $ inflation: num [1:832] -0.0087 -0.0001 0.0084 0.0077 0.0144 0.0203 0.021 0.0257 0.031 0.0304 ...
str(unemployment)
## tibble [286 × 2] (S3: tbl_df/tbl/data.frame)
## $ date : POSIXct[1:286], format: "2001-01-01" "2001-02-01" ...
## $ unemployment: num [1:286] 16.6 17.4 15.8 14.5 14 ...
str(google_trends)
## tibble [262 × 2] (S3: tbl_df/tbl/data.frame)
## $ date : POSIXct[1:262], format: "2019-11-10" "2019-11-17" ...
## $ google_trends: num [1:262] 64 60 63 65 69 63 80 98 75 63 ...
# rain is reported per river per day, so it must be aggregated before use
str(rain)
## tibble [56,648 × 6] (S3: tbl_df/tbl/data.frame)
## $ date : POSIXct[1:56648], format: "2021-01-01" "2021-01-01" ...
## $ region : chr [1:56648] "ANTIOQUIA" "ANTIOQUIA" "ANTIOQUIA" "ANTIOQUIA" ...
## $ river : chr [1:56648] "A. SAN LORENZO" "CARLOS LLERAS" "CONCEPCION" "DESV. EEPPM (NEC,PAJ,DOL)" ...
## $ contribution_m3s : num [1:56648] 19.29 20.31 3.61 7.48 19.5 ...
## $ contrribution_kWh: num [1:56648] 4354800 3143300 1011700 2096000 899700 ...
## $ contribution_pct : num [1:56648] 0.798 0.79 0.712 0.944 0.638 ...
str(temp) # this has NaNs, must fill somehow
## tibble [1,113 × 4] (S3: tbl_df/tbl/data.frame)
## $ date : POSIXct[1:1113], format: "2021-10-31" "2021-11-01" ...
## $ tavg : num [1:1113] 19.8 18.3 18.4 18.3 18.6 19.8 19.9 19.7 18.8 18.7 ...
## $ prcp : num [1:1113] 0 0 0 0 0 0 0 0 0 0 ...
## $ tmedian: num [1:1113] 19.8 18.3 18.4 18.3 18.6 19.8 19.9 19.7 18.8 18.7 ...
# Aggregation to monthly / weekly frequency --------------------------------
# Quick look at the raw daily sales series before aggregating.
plot(sales$sales_cop)

# Monthly sales totals (overall revenue, bar, food). The grouping column is
# created directly inside group_by() (mutate semantics).
df_sales_m <- sales %>%
  group_by(month = floor_date(date, "month")) %>%
  summarise(sales_m = sum(sales_cop), bar_m = sum(bar), food_m = sum(food))
head(df_sales_m)

# Weekly sales totals.
df_sales_w <- sales %>%
  group_by(week = floor_date(date, "week")) %>%
  summarise(sales_w = sum(sales_cop), bar_w = sum(bar), food_w = sum(food))
head(df_sales_w)

# FX (COP per USD): average the daily rate within each month / week.
df_fx_m <- fx %>%
  group_by(month = floor_date(date, "month")) %>%
  summarise(fx_m = mean(fx))
df_fx_w <- fx %>%
  group_by(week = floor_date(date, "week")) %>%
  summarise(fx_w = mean(fx))
head(df_fx_m)
head(df_fx_w)

# Google Trends ("restaurantes"): average the weekly index within each
# month / week.
df_google_m <- google_trends %>%
  group_by(month = floor_date(date, "month")) %>%
  summarise(google_m = mean(google_trends))
df_google_w <- google_trends %>%
  group_by(week = floor_date(date, "week")) %>%
  summarise(google_w = mean(google_trends))
head(df_google_m)
head(df_google_w)
## rain ---------------------------------------------------------------------
# Rain is reported per river per day: first collapse to one value per
# (date, region), then keep only Antioquia (Medellin's region).
# `.groups = "drop"` returns an ungrouped tibble explicitly instead of
# leaving the result grouped by `date` (and silencing the knitr message).
df_rain_g <- rain %>%
  group_by(date, region) %>%
  summarise(rain_sum = sum(contribution_m3s), .groups = "drop")
df_rain_g <- df_rain_g[df_rain_g$region == "ANTIOQUIA", ]
head(df_rain_g)
# monthly totals
df_rain_m <- df_rain_g %>%
  mutate(month = floor_date(date, "month")) %>%
  group_by(month) %>%
  summarise(rain_m = sum(rain_sum))
# weekly totals
df_rain_w <- df_rain_g %>%
  mutate(week = floor_date(date, "week")) %>%
  group_by(week) %>%
  summarise(rain_w = sum(rain_sum))
head(df_rain_m)
head(df_rain_w)
# temperature
# monthly: average temperature, total precipitation per month
df_temp_m <- temp %>%
mutate(month = floor_date(date, "month")) %>%
group_by(month) %>%
summarise(temp_m = mean(tavg), prcp_m = sum(prcp))
# weekly
df_temp_w <- temp %>%
mutate(week = floor_date(date, "week")) %>%
group_by(week) %>%
summarise(temp_w = mean(tavg), prcp_w = sum(prcp))
head(df_temp_m)
head(df_temp_w)
## daily data----------
# sales, rain and fx are the only daily series; inner joins keep only the
# dates present in every table
df_merged_d <- merge(sales, df_rain_g, by = "date", all = FALSE) # Inner join
df_merged_d <- merge(df_merged_d, fx, by = "date", all = FALSE) # Inner join
df_merged_d <- merge(df_merged_d, temp, by = "date", all = FALSE) # Inner join
head(df_merged_d)
### weekly data----------
# same idea at weekly frequency (all=F means inner join: intersection of weeks)
df_merged_w <- merge(df_sales_w, df_rain_w, by="week", all=F)
df_merged_w <- merge(df_merged_w, df_google_w, by="week", all=F)
df_merged_w <- merge(df_merged_w, df_fx_w, by="week", all=F)
df_merged_w <- merge(df_merged_w, df_temp_w, by="week", all=F)
head(df_merged_w)
### monthly data----------
# rename join keys so every monthly table exposes a `month` column
names(eco_growth) <- c("month", "ise")
names(inflation) <- c("month", "inflation")
names(unemployment) <- c("month", "unemployment")
# inner-join each covariate in turn; nrow() is printed after every merge to
# verify no months are silently lost (it stays at 36 throughout)
df_merged_m <- merge(df_sales_m, df_rain_m, by="month", all=F)
nrow(df_merged_m)
## [1] 36
df_merged_m <- merge(df_merged_m, df_fx_m, by="month", all=F)
nrow(df_merged_m)
## [1] 36
df_merged_m <- merge(df_merged_m, df_google_m, by="month", all=F)
nrow(df_merged_m)
## [1] 36
df_merged_m <- merge(df_merged_m, eco_growth, by="month", all=F) # only has until aug 2024
nrow(df_merged_m)
## [1] 36
df_merged_m <- merge(df_merged_m, inflation, by="month", all=F)
nrow(df_merged_m)
## [1] 36
df_merged_m <- merge(df_merged_m, unemployment, by="month", all=F)
nrow(df_merged_m)
## [1] 36
df_merged_m <- merge(df_merged_m, df_temp_m, by="month", all=F)
nrow(df_merged_m)
## [1] 36
# keep only the merged data sets and discard every intermediate object
objects_to_keep <- c("df_merged_d", "df_merged_w", "df_merged_m")
# Remove all objects except those specified
rm(list = setdiff(ls(), objects_to_keep))
# Sales at the three frequencies --------------------------------------------
# Daily
ggplot(df_merged_d, aes(x = date, y = sales_cop)) +
  geom_line() +
  labs(title = "Daily Sales of Restaurant")
# Weekly
ggplot(df_merged_w, aes(x = week, y = sales_w)) +
  geom_line() +
  labs(title = "Weekly Sales of Restaurant")
# Monthly
ggplot(df_merged_m, aes(x = month, y = sales_m)) +
  geom_line() +
  labs(title = "Monthly Sales of Restaurant")
We want to move to a stacked bar chart when we care about the relative decomposition of each primary bar based on the levels of a second categorical variable. Each bar is now comprised of a number of sub-bars, each one corresponding with a level of a secondary categorical variable. The total length of each stacked bar is the same as before, but now we can see how the secondary groups contributed to that total.
One important consideration in building a stacked bar chart is deciding which of the two categorical variables will be the primary variable (dictating major axis positions and overall bar lengths) and which will be the secondary (dictating how each primary bar is subdivided). The most important variable should be the primary; use domain knowledge and the specific types of the categorical variables to decide how to assign them.
# Stacked bar charts: decompose each period's total into bar vs food --------
# Monthly: reshape wide (bar_m, food_m) to long so each category becomes a
# fill level of the stacked bars.
df_sales_m_long <- df_merged_m %>%
  pivot_longer(cols = c(bar_m, food_m), names_to = "Category", values_to = "Value")
# geom_col() is geom_bar(stat = "identity"); stacking is its default position.
ggplot(df_sales_m_long, aes(x = month, y = Value, fill = Category)) +
  geom_col(position = "stack") +
  labs(title = "Monthly Sales of Restaurant", x = "Month", y = "Sales", fill = "Category") +
  theme_minimal()
# Weekly: same chart at weekly frequency.
df_sales_w_long <- df_merged_w %>%
  pivot_longer(cols = c(bar_w, food_w), names_to = "Category", values_to = "Value")
ggplot(df_sales_w_long, aes(x = week, y = Value, fill = Category)) +
  geom_col(position = "stack") +
  labs(title = "Weekly Sales of Restaurant", x = "Week", y = "Sales", fill = "Category") +
  theme_minimal()
# Seasonal plots
# NOTE(review): frequency = 52 only approximates weekly data (a year has
# ~52.18 weeks), so late-year observations may drift slightly across years.
df_sales_w_filtered <- df_merged_w %>%
filter(week >= ymd("2021-12-31"))
tseries_w <- ts(df_sales_w_filtered$sales_w , start = c(2022, 1), frequency = 52)
head(tseries_w)
## Time Series:
## Start = c(2022, 1)
## End = c(2022, 6)
## Frequency = 52
## [1] 5654618 5894371 8308052 9164178 11934123 9777570
seasonplot(tseries_w, col = rainbow(3), year.labels = TRUE, main = "Seasonal Plot")
# manual annotation for the incomplete 2024 season
text(x = 1, y = max(tseries_w) - 1.5e7, labels = "2024", col = "blue")
# seasonplot monthly
df_sales_m_filtered <- df_merged_m %>%
filter(month >= ymd("2021-12-31"))
tseries_m <- ts(df_sales_m_filtered$sales_m , start = c(2022, 1), frequency = 12)
head(tseries_m)
## Jan Feb Mar Apr May Jun
## 2022 31953282 41926179 37466677 57926246 79622843 88816435
seasonplot(tseries_m, col = rainbow(3), year.labels = TRUE, main = "Seasonal Plot")
text(x = 1, y = max(tseries_m) - 1e6, labels = "2024", col = "blue")
# Density plots --------------------------------------------------------------
# The same reshape + facetted-density chart is needed at all three
# frequencies, so the logic lives in one helper instead of being repeated
# three times.
#
# df:   data frame with the variables to plot
# vars: character vector of column names to include
# Returns a ggplot object; the caller prints it (explicit print() is needed
# once the plot is built inside a function).
density_grid <- function(df, vars) {
  df_long <- df %>%
    pivot_longer(cols = all_of(vars), names_to = "Variable", values_to = "Value")
  ggplot(df_long, aes(x = Value)) +
    geom_density(fill = "blue", alpha = 0.4) +
    facet_wrap(~ Variable, scales = "free", ncol = 3) +
    labs(title = "Density Plots of Selected Variables",
         x = "Value", y = "Density") +
    theme_minimal()
}
# Monthly
print(density_grid(df_merged_m, c("sales_m", "bar_m", "food_m", "rain_m",
                                  "fx_m", "google_m", "ise", "inflation",
                                  "unemployment", "temp_m", "prcp_m")))
# Weekly
print(density_grid(df_merged_w, c("sales_w", "bar_w", "food_w", "rain_w",
                                  "fx_w", "google_w", "temp_w", "prcp_w")))
# Daily
print(density_grid(df_merged_d, c("sales_cop", "bar", "food", "rain_sum",
                                  "fx", "tmedian", "prcp")))
### 3.5.1 economic variables-----------------------
# Economic activity (ISE): clearly seasonal with an upward trend.
ggplot(df_merged_m, aes(x = month, y = ise)) +
  geom_line() + ggtitle("Monthly activity in Colombia")
# FX: trend but no clear seasonality.
ggplot(df_merged_d, aes(x = date, y = fx)) +
  geom_line() + ggtitle("Daily COP/USD")
# Inflation: business cycles, no trend or seasonality.
ggplot(df_merged_m, aes(x = month, y = inflation)) +
  geom_line() + ggtitle("Monthly inflation National")
# Unemployment: seasonal with a downward trend.
# (plot title typo fixed: "Montly" -> "Monthly")
ggplot(df_merged_m, aes(x = month, y = unemployment)) +
  geom_line() + ggtitle("Monthly trailing unemployment Medellin")
### 3.5.2 Other variables
# Google Trends: no clear behaviour, drop during the pandemic.
# (plot title typo fixed: "Weelkly" -> "Weekly")
ggplot(df_merged_w, aes(x = week, y = google_w)) +
  geom_line() + ggtitle("Weekly Google trends 'Restaurantes'")
# Rain: no clear trend or seasonality.
ggplot(df_merged_d, aes(x = date, y = rain_sum)) +
  geom_line() + ggtitle("Daily rain approximated in Antioquia")
# Median temperature: almost stationary.
ggplot(df_merged_d, aes(x = date, y = tmedian)) +
  geom_line() + ggtitle("Daily Median temperature in Medellin")
# Average temperature looks odd — keep working with the median instead.
ggplot(df_merged_d, aes(x = date, y = tavg)) +
  geom_line() + ggtitle("Daily Average temperature in Medellin")
# Precipitation from the temperature source: looks decent.
ggplot(df_merged_d, aes(x = date, y = prcp)) +
  geom_line() + ggtitle("Daily precipitation in Medellin")
# Pairwise relationships and correlation matrices ---------------------------
# region is constant ("ANTIOQUIA") after the earlier filter, so drop it
# before the numeric pair plots
df_merged_d <- subset(df_merged_d, select = -region)
# daily
ggpairs(df_merged_d,
columns = 2:8)
# sales have correl with fx and rain_sum
# weekly
ggpairs(df_merged_w,
columns = 2:9)
# sales have correl with rain, google, fx, temp
# bar has more correl with temp
# monthly
ggpairs(df_merged_m,
columns = 2:12)
# Exclude 'date' column
numeric_df_d <- df_merged_d[, sapply(df_merged_d, is.numeric)]
cor_matrix_d <- cor(numeric_df_d, use = "complete.obs") # Use only complete rows
cor_matrix_d
## sales_cop bar food rain_sum fx
## sales_cop 1.000000000 0.895118281 0.98135791 0.1662524 0.14677410
## bar 0.895118281 1.000000000 0.79301917 0.1275670 0.13671488
## food 0.981357913 0.793019172 1.00000000 0.1716482 0.14261504
## rain_sum 0.166252448 0.127567019 0.17164822 1.0000000 0.63500919
## fx 0.146774104 0.136714885 0.14261504 0.6350092 1.00000000
## tavg 0.008884659 0.089640797 -0.02562481 -0.1340051 0.16455022
## prcp 0.028085504 -0.001337552 0.03913167 0.1857676 0.05556529
## tmedian 0.108891610 0.111686146 0.10120437 -0.2717930 -0.07710937
## tavg prcp tmedian
## sales_cop 0.008884659 0.028085504 0.10889161
## bar 0.089640797 -0.001337552 0.11168615
## food -0.025624811 0.039131672 0.10120437
## rain_sum -0.134005147 0.185767588 -0.27179302
## fx 0.164550220 0.055565287 -0.07710937
## tavg 1.000000000 -0.267367632 0.65617098
## prcp -0.267367632 1.000000000 -0.22103872
## tmedian 0.656170981 -0.221038724 1.00000000
numeric_df_w <- df_merged_w[, sapply(df_merged_w, is.numeric)]
cor_matrix_w <- cor(numeric_df_w, use = "complete.obs") # Use only complete rows
cor_matrix_w
## sales_w bar_w food_w rain_w google_w
## sales_w 1.00000000 0.8910033 0.98922211 0.2913161 -0.444675519
## bar_w 0.89100331 1.0000000 0.81506933 0.2875652 -0.355177208
## food_w 0.98922211 0.8150693 1.00000000 0.2785842 -0.453332380
## rain_w 0.29131607 0.2875652 0.27858415 1.0000000 -0.107348280
## google_w -0.44467552 -0.3551772 -0.45333238 -0.1073483 1.000000000
## fx_w 0.24395629 0.2873958 0.22038752 0.6626647 -0.002777267
## temp_w -0.02746535 0.1275798 -0.07507291 -0.1062959 0.208497604
## prcp_w 0.17241188 0.1211778 0.18095801 0.3289820 -0.251630474
## fx_w temp_w prcp_w
## sales_w 0.243956293 -0.02746535 0.1724119
## bar_w 0.287395824 0.12757976 0.1211778
## food_w 0.220387520 -0.07507291 0.1809580
## rain_w 0.662664730 -0.10629590 0.3289820
## google_w -0.002777267 0.20849760 -0.2516305
## fx_w 1.000000000 0.19315256 0.1078585
## temp_w 0.193152564 1.00000000 -0.3047876
## prcp_w 0.107858510 -0.30478758 1.0000000
numeric_df_m <- df_merged_m[, sapply(df_merged_m, is.numeric)]
cor_matrix_m <- cor(numeric_df_m, use = "complete.obs") # Use only complete rows
cor_matrix_m
## sales_m bar_m food_m rain_m fx_m
## sales_m 1.00000000 0.91772972 0.9930535 0.345901419 0.260081195
## bar_m 0.91772972 1.00000000 0.8646613 0.357818748 0.330438807
## food_m 0.99305351 0.86466133 1.0000000 0.331685570 0.232546131
## rain_m 0.34590142 0.35781875 0.3316856 1.000000000 0.700900212
## fx_m 0.26008119 0.33043881 0.2325461 0.700900212 1.000000000
## google_m -0.60141163 -0.43583265 -0.6306910 -0.127328581 -0.016302700
## ise 0.14184921 0.09197376 0.1528346 0.143925313 0.006203153
## inflation 0.24620155 0.41865460 0.1885633 0.554477494 0.731585721
## unemployment -0.47774546 -0.35355440 -0.4999277 -0.314673431 -0.148857922
## temp_m -0.05360312 0.12366320 -0.1041492 -0.004770513 0.242855832
## prcp_m 0.28057040 0.20114717 0.2956418 0.364914544 0.106392263
## google_m ise inflation unemployment temp_m
## sales_m -0.60141163 0.141849210 0.2462015 -0.477745462 -0.053603118
## bar_m -0.43583265 0.091973761 0.4186546 -0.353554400 0.123663200
## food_m -0.63069104 0.152834568 0.1885633 -0.499927664 -0.104149167
## rain_m -0.12732858 0.143925313 0.5544775 -0.314673431 -0.004770513
## fx_m -0.01630270 0.006203153 0.7315857 -0.148857922 0.242855832
## google_m 1.00000000 0.093304022 0.1600715 0.425504221 0.339768042
## ise 0.09330402 1.000000000 -0.1118127 -0.602525241 -0.190530186
## inflation 0.16007149 -0.111812691 1.0000000 -0.130801728 0.598250278
## unemployment 0.42550422 -0.602525241 -0.1308017 1.000000000 -0.007286939
## temp_m 0.33976804 -0.190530186 0.5982503 -0.007286939 1.000000000
## prcp_m -0.37823361 0.047259121 0.0657641 -0.415992203 -0.280144464
## prcp_m
## sales_m 0.28057040
## bar_m 0.20114717
## food_m 0.29564181
## rain_m 0.36491454
## fx_m 0.10639226
## google_m -0.37823361
## ise 0.04725912
## inflation 0.06576410
## unemployment -0.41599220
## temp_m -0.28014446
## prcp_m 1.00000000
# Plot the Correlation Matrix (one heatmap per frequency)
par(mfrow=c(1,1))
corrplot(cor_matrix_d, method = "color", type = "upper", tl.col = "black", tl.srt = 45)
corrplot(cor_matrix_w, method = "color", type = "upper", tl.col = "black", tl.srt = 45)
corrplot(cor_matrix_m, method = "color", type = "upper", tl.col = "black", tl.srt = 45)
Rain has a stronger correlation than prcp, so we drop prcp to avoid including the same variable from two sources. We also drop average temperature because median temperature seems more trustworthy.
# Drop prcp because rain_sum captures the same signal from a better source,
# and drop tavg in favour of the more reliable tmedian.
df_merged_m <- select(df_merged_m, -prcp_m)
df_merged_w <- select(df_merged_w, -prcp_w)
df_merged_d <- select(df_merged_d, -prcp, -tavg)
colnames(df_merged_d)
## [1] "date" "sales_cop" "bar" "food" "rain_sum" "fx"
## [7] "tmedian"
# Keep only the merged data sets; discard every other object.
objects_to_keep <- c("df_merged_d", "df_merged_w", "df_merged_m")
rm(list = setdiff(ls(), objects_to_keep))
POSIXct and POSIXlt Classes
Times and date-times are represented by the POSIXct or the POSIXlt class in R. The POSIXct format stores date and time as the number of seconds since January 1, 1970, so a POSIXct date-time is essentially a single value on a timeline. Date-times prior to 1970 will be negative numbers. The POSIXlt class stores other date and time information in a list, such as the hour, day of the week, month of the year, etc. The starting year for POSIXlt data is 1900, so 2022 would be stored as year 122. Months also begin at 0, so January is stored as month 0 and February as month 1. For both POSIX classes, the timezone can be specified. While date-times stored as POSIXct and POSIXlt look similar, when you unclass them with the unclass() function, you can see the additional information stored within the POSIXlt data.
Date Class
Dates without time can simply be stored as a Date class in R using the as.Date() function. Both Date and POSIX classes need to be defined based on how they are formatted. When uploading time series data into R, date and date-time data is typically uploaded as a character class and must be converted to a date or time class using the as.Date(), as.POSIXct() or as.POSIXlt() functions.
Monthly
# Vars for model
# Month
# Ensure the `month` column is in POSIXct format
df_merged_m$month <- as.POSIXct(df_merged_m$month)
# Create the numeric variable: an evenly increasing number (linear trend term)
df_merged_m <- df_merged_m %>%
arrange(month) %>% # Ensure data is sorted by month
mutate(numeric_month = row_number()) # Assign an increasing number
# Create the seasonal variable: the 12 different months as a factor
# NOTE(review): format(month, "%B") is locale-dependent; in a non-English
# locale the names will not match `month.name` and the factor becomes all
# NA — confirm the script always runs under an English locale.
df_merged_m <- df_merged_m %>%
mutate(seasonal_month = factor(format(month, "%B"), levels = month.name)) # Month names as ordered factors
Weekly
# Week
# Ensure the `week` column is in POSIXct format
df_merged_w$week <- as.POSIXct(df_merged_w$week)
# Create the numeric variable: an evenly increasing number
df_merged_w <- df_merged_w %>%
arrange(week) %>% # Ensure data is sorted by week
mutate(numeric_week = row_number()) # Assign an increasing number
# Create the seasonal variable: month-of-year of each week, as a factor
df_merged_w <- df_merged_w %>%
mutate(seasonal_month = factor(format(week, "%B"), levels = month.name)) # Month names as ordered factors
Daily
# Day
# Ensure the `day` column is in POSIXct format
df_merged_d$date <- as.POSIXct(df_merged_d$date)
# Create the numeric variable: an evenly increasing number
df_merged_d <- df_merged_d %>%
arrange(date) %>% # Ensure data is sorted by day
mutate(numeric_day = row_number()) # Assign an increasing number
# Create the seasonal variable: month-of-year as a factor
df_merged_d <- df_merged_d %>%
mutate(seasonal_month = factor(format(date, "%B"), levels = month.name)) # Month names as ordered factors
# Create a column indicating the day of the week
# NOTE(review): weekdays() is also locale-dependent (same caveat as %B).
df_merged_d <- df_merged_d %>%
mutate(day_of_week = factor(weekdays(date), levels = c("Monday", "Tuesday", "Wednesday",
"Thursday", "Friday", "Saturday", "Sunday"))) # Day of the week as ordered factor
Convert sales to time series objects for use in several models.
# convert to time series
# NOTE: ts() without a `frequency` argument defaults to frequency = 1, so
# these objects carry no seasonal metadata; tsdisplay() still shows the
# series, ACF and PACF.
sales_d_ts <- ts(df_merged_d$sales_cop)
sales_w_ts <- ts(df_merged_w$sales_w)
sales_m_ts <- ts(df_merged_m$sales_m)
par(mfrow=c(1,1))
# Daily
tsdisplay(sales_d_ts)
# is not stationary but has no clear trend
# and seasonality every 7 days
# Weekly
tsdisplay(sales_w_ts)
# not stationary: has trend
# Monthly
tsdisplay(sales_m_ts)
# has clear trend, no seasonality
Some variables are log-transformed so the linear models are easier to interpret: the covariates are on different scales, so it is easier to interpret percentage changes instead of unit changes.
# Log transform: log(x + 1) guards against zeros in the data.
# NOTE(review): across(where(is.numeric)) also transforms the trend/index
# columns (numeric_month / numeric_week / numeric_day), so the later
# "trend" regressors are log-scaled — confirm this is intended.
# Monthly: unemployment and inflation are already rates, so they keep
# their original scale.
df_merged_m <- df_merged_m %>%
mutate(across(where(is.numeric) & !all_of(c("unemployment", "inflation")), ~ log(. + 1)))
# Weekly
df_merged_w <- df_merged_w %>%
mutate(across(where(is.numeric), ~ log(. + 1)))
# Daily
df_merged_d <- df_merged_d %>%
mutate(across(where(is.numeric), ~ log(. + 1)))
#par(mfrow=c(1,1))
#tsdisplay(sales_d_ts)
# is not stationary but has no clear trend
# NOTE: sales_d_ts was built before the log transform above, so these
# plots show the raw (not log) daily sales.
plot(sales_d_ts)
acf(sales_d_ts)
pacf(sales_d_ts)
When data are seasonal, the autocorrelation will be larger for the seasonal lags (at multiples of the seasonal period) than for other lags.
# ACF / PACF of the weekly and monthly series (raw scale; see note above)
# Weekly
#tsdisplay(sales_w_ts)
plot(sales_w_ts)
acf(sales_w_ts)
pacf(sales_w_ts)
# not stationary: has trend and seasonality maybe
# Monthly
#tsdisplay(sales_m_ts)
plot(sales_m_ts)
acf(sales_m_ts)
pacf(sales_m_ts)
# has clear trend, no seasonality
In this section we model the time series using various approaches to find the best model for our data. We use both linear and non-linear models, going from the simplest to the more “complex” models.
Functions that help us implement and analyze models faster
## Function to create and summarize models------------------
# Fit an OLS model, print its summary, and draw the four standard lm
# diagnostic plots on a 2x2 grid.
#
# formula:    model formula passed to lm()
# data:       data frame containing the model variables
# model_name: label printed before the summary
# Returns the fitted lm object.
run_model <- function(formula, data, model_name) {
  cat("\nRunning", model_name, "\n")
  model <- lm(formula, data = data)
  print(summary(model))
  # Restore the caller's plotting layout when done instead of leaving the
  # device stuck on a 2x2 grid (the original leaked this par() change).
  old_par <- par(mfrow = c(2, 2))
  on.exit(par(old_par), add = TRUE)
  plot(model)
  model
}
# Function to compare models using ANOVA
compare_models <- function(model1, model2, name1, name2) {
cat("\nComparing Models:", name1, "vs", name2, "\n")
anova_result <- anova(model1, model2)
print(anova_result)
return(anova_result)
}
# Append model predictions for `data` as a new column named `pred_column`
# and return the augmented data frame (the input is not modified in place).
add_predictions <- function(model, data, pred_column) {
  preds <- predict(model, newdata = data)
  data[[pred_column]] <- preds
  data
}
# Root mean squared error between observed and predicted values.
# Pairs where either side is NA are excluded from the mean.
calculate_rmse <- function(observed, predicted) {
  squared_errors <- (observed - predicted)^2
  sqrt(mean(squared_errors, na.rm = TRUE))
}
# Build a data frame of fit statistics for a named list of fitted lm models.
#
# models: named list of lm objects; names become the Model column labels.
# Returns a data frame with one row per model and columns Model, R2, AIC.
get_model_stats <- function(models) {
  # Preserve the original empty-input contract: a zero-row data frame with
  # the expected columns.
  if (length(models) == 0) {
    return(data.frame(
      Model = character(),
      R2 = numeric(),
      AIC = numeric(),
      stringsAsFactors = FALSE
    ))
  }
  rows <- lapply(seq_along(models), function(i) {
    model <- models[[i]]
    data.frame(
      Model = names(models)[i],
      R2 = summary(model)$r.squared,
      AIC = AIC(model),
      stringsAsFactors = FALSE
    )
  })
  # Bind once at the end instead of growing the data frame inside a loop
  # (rbind-in-a-loop copies the accumulator on every iteration).
  do.call(rbind, rows)
}
# Monthly Models -------------------------------------------------------------
# sales_m is on the log scale here, so coefficients are roughly
# percentage effects.
# View Dataframe
head(df_merged_m)
# Model 0: Trend only
ols0 <- run_model(sales_m ~ numeric_month, df_merged_m, "Model 0")
##
## Running Model 0
##
## Call:
## lm(formula = formula, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9910 -0.1192 0.0158 0.1476 0.4953
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.40311 0.18031 90.97 < 2e-16 ***
## numeric_month 0.68309 0.06312 10.82 1.49e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.28 on 34 degrees of freedom
## Multiple R-squared: 0.775, Adjusted R-squared: 0.7684
## F-statistic: 117.1 on 1 and 34 DF, p-value: 1.485e-12
# store in-sample fitted values for the later comparison plot
df_merged_m <- add_predictions(ols0, df_merged_m, "predicted_sales0")
# Model 1: Trend + Seasonality (month-of-year dummies)
ols1 <- run_model(sales_m ~ numeric_month + seasonal_month, df_merged_m, "Model 1")
##
## Running Model 1
##
## Call:
## lm(formula = formula, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7403 -0.1590 -0.0135 0.2071 0.4347
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.505470 0.260998 63.240 < 2e-16 ***
## numeric_month 0.657412 0.075865 8.666 1.06e-08 ***
## seasonal_monthFebruary -0.001699 0.254032 -0.007 0.995
## seasonal_monthMarch -0.006230 0.254345 -0.024 0.981
## seasonal_monthApril -0.063031 0.254777 -0.247 0.807
## seasonal_monthMay 0.070611 0.255288 0.277 0.785
## seasonal_monthJune 0.037438 0.255855 0.146 0.885
## seasonal_monthJuly 0.138742 0.256462 0.541 0.594
## seasonal_monthAugust -0.010812 0.257098 -0.042 0.967
## seasonal_monthSeptember -0.133330 0.257755 -0.517 0.610
## seasonal_monthOctober -0.058397 0.258427 -0.226 0.823
## seasonal_monthNovember -0.335279 0.254924 -1.315 0.201
## seasonal_monthDecember -0.016084 0.254094 -0.063 0.950
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.311 on 23 degrees of freedom
## Multiple R-squared: 0.8123, Adjusted R-squared: 0.7144
## F-statistic: 8.296 on 12 and 23 DF, p-value: 8.937e-06
df_merged_m <- add_predictions(ols1, df_merged_m, "predicted_sales1")
## Model 2: Backward Stepwise Regression
# Start with the full model (excluding food and bar, which are components
# of the target and would leak the answer)
ols2_full <- lm(
sales_m ~ numeric_month + seasonal_month + unemployment + ise + fx_m +
google_m + temp_m + rain_m,
data = df_merged_m
)
# Perform backward stepwise regression (AIC-based variable elimination)
ols2_stepwise <- step(
ols2_full,
direction = "backward",
trace = 1 # Prints the stepwise regression process
)
## Start: AIC=-104.31
## sales_m ~ numeric_month + seasonal_month + unemployment + ise +
## fx_m + google_m + temp_m + rain_m
##
## Df Sum of Sq RSS AIC
## - temp_m 1 0.00008 0.69105 -106.310
## - unemployment 1 0.01523 0.70620 -105.530
## <none> 0.69097 -104.314
## - rain_m 1 0.07578 0.76674 -102.568
## - seasonal_month 11 0.71692 1.40789 -100.691
## - ise 1 0.17179 0.86276 -98.321
## - fx_m 1 0.39083 1.08180 -90.176
## - google_m 1 0.57575 1.26672 -84.495
## - numeric_month 1 2.29957 2.99054 -53.570
##
## Step: AIC=-106.31
## sales_m ~ numeric_month + seasonal_month + unemployment + ise +
## fx_m + google_m + rain_m
##
## Df Sum of Sq RSS AIC
## - unemployment 1 0.02314 0.71419 -107.125
## <none> 0.69105 -106.310
## - rain_m 1 0.07693 0.76798 -104.510
## - seasonal_month 11 0.79109 1.48214 -100.841
## - ise 1 0.17775 0.86880 -100.070
## - fx_m 1 0.43032 1.12137 -90.883
## - google_m 1 0.65543 1.34648 -84.297
## - numeric_month 1 2.42508 3.11613 -54.089
##
## Step: AIC=-107.12
## sales_m ~ numeric_month + seasonal_month + ise + fx_m + google_m +
## rain_m
##
## Df Sum of Sq RSS AIC
## <none> 0.7142 -107.125
## - rain_m 1 0.0940 0.8082 -104.673
## - seasonal_month 11 0.7847 1.4989 -102.437
## - ise 1 0.1606 0.8748 -101.823
## - fx_m 1 0.4343 1.1485 -92.022
## - google_m 1 0.6466 1.3608 -85.916
## - numeric_month 1 4.1118 4.8260 -40.342
# Summary of the final stepwise model (temp_m and unemployment dropped)
summary(ols2_stepwise)
##
## Call:
## lm(formula = sales_m ~ numeric_month + seasonal_month + ise +
## fx_m + google_m + rain_m, data = df_merged_m)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.31229 -0.08760 -0.01030 0.09029 0.31789
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -52.31160 19.99397 -2.616 0.016981 *
## numeric_month 0.93626 0.08952 10.459 2.54e-09 ***
## seasonal_monthFebruary 0.27615 0.19812 1.394 0.179459
## seasonal_monthMarch -0.33136 0.28825 -1.150 0.264583
## seasonal_monthApril -0.03032 0.20910 -0.145 0.886232
## seasonal_monthMay -0.12629 0.27633 -0.457 0.652850
## seasonal_monthJune -0.16788 0.27401 -0.613 0.547361
## seasonal_monthJuly -0.11655 0.30708 -0.380 0.708483
## seasonal_monthAugust -0.45680 0.33544 -1.362 0.189189
## seasonal_monthSeptember -0.40348 0.27930 -1.445 0.164861
## seasonal_monthOctober -0.21625 0.28064 -0.771 0.450439
## seasonal_monthNovember -0.67606 0.38426 -1.759 0.094603 .
## seasonal_monthDecember -1.52898 0.67026 -2.281 0.034247 *
## ise 7.20920 3.48786 2.067 0.052641 .
## fx_m 2.72323 0.80115 3.399 0.003010 **
## google_m 3.17853 0.76636 4.148 0.000547 ***
## rain_m -0.20297 0.12835 -1.581 0.130285
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1939 on 19 degrees of freedom
## Multiple R-squared: 0.9397, Adjusted R-squared: 0.889
## F-statistic: 18.52 on 16 and 19 DF, p-value: 2.609e-08
# diagnostic plots of the selected model
plot(ols2_stepwise)
# Add predictions from the final stepwise model
df_merged_m <- add_predictions(ols2_stepwise, df_merged_m, "predicted_sales2")
# Plot Actual vs Predicted Values
# Sales were modelled on the log scale, so actuals and predictions are
# exp()-transformed back for plotting.
# NOTE(review): the forward transform was log(x + 1), whose exact inverse
# is exp(x) - 1; the missing "- 1" is negligible at this sales magnitude
# but strictly inconsistent — confirm intended.
plot_colors <- c("black", "red", "blue", "darkgreen")
line_types <- c(1, 2, 3,4) # Solid, dashed, dotted and dash-dot lines
par(mfrow = c(1,1))
# the base plot for actual sales
plot(
df_merged_m$month, exp(df_merged_m$sales_m),
type = "p", # Points for actual sales
col = "black", pch = 16,
xlab = "Month",
ylab = "Sales",
main = "Actual vs Predicted Monthly Sales for All Models"
)
# Add the lines for each model
lines(
df_merged_m$month, exp(df_merged_m$predicted_sales0),
col = plot_colors[2],
lty = line_types[2],
lwd = 2
)
lines(
df_merged_m$month, exp(df_merged_m$predicted_sales1),
col = plot_colors[3],
lty = line_types[3],
lwd = 2
)
lines(
df_merged_m$month, exp(df_merged_m$predicted_sales2),
col = plot_colors[4],
lty = 4,
lwd = 2
)
# Add a legend
legend(
"bottomright",
legend = c("Actual Sales", "Model 0", "Model 1", "Model 2 Stepwise"),
col = c("black", plot_colors[2:4]),
lty = c(NA, line_types[2:4]),
pch = c(16, NA, NA, NA),
bty = "n"
)
# Models to compare (all fitted on log-scale monthly sales)
models <- list(
  "Model trend" = ols0,
  "Model trend + season" = ols1,
  "Model all covariates step" = ols2_stepwise
)
# Get R^2 and AIC for each model
model_stats <- get_model_stats(models)
# RMSE on the original (exponentiated) scale, one value per model.
# Model i stored its fitted values in column paste0("predicted_sales", i - 1),
# in the same order as the models list.
rmse_values <- vapply(
  seq_along(models),
  function(i) {
    calculate_rmse(
      observed = exp(df_merged_m$sales_m), # exponentiate actual values
      predicted = exp(df_merged_m[[paste0("predicted_sales", i - 1)]])
    )
  },
  numeric(1)
)
# Build the RMSE table in one shot (avoids growing a data frame with rbind)
rmse_stats <- data.frame(
  Model = names(models),
  RMSE = rmse_values,
  stringsAsFactors = FALSE
)
# Combine R^2, AIC, and RMSE into one table
combined_stats <- merge(model_stats, rmse_stats, by = "Model")
# View the combined table
print(combined_stats)
## Model R2 AIC RMSE
## 1 Model all covariates step 0.9397340 -2.961086 13229295
## 2 Model trend 0.7750083 14.461543 21374338
## 3 Model trend + season 0.8123201 29.933834 21523336
# RMSE of the best monthly OLS model (the stepwise model). Select the row by
# name rather than by position so a reordering of the models list cannot
# silently pick the wrong value.
rmse_ols_m <- rmse_stats$RMSE[rmse_stats$Model == "Model all covariates step"]
rmse_ols_m
## [1] 13229295
# Check the residuals of the best monthly model; the Ljung-Box p-value
# (0.216) is above 0.05, so no significant residual autocorrelation
checkresiduals(ols2_stepwise$residuals)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 9.5442, df = 7, p-value = 0.2159
##
## Model df: 0. Total lags used: 7
# Weekly Models ----
head(df_merged_w)
## Clean data - drop rows 1-2 because sales are 0 / the restaurant was not open yet
df_merged_w <- df_merged_w %>% slice(-1, -2)
## Model 0A: Trend only
ols0w <- run_model(sales_w ~ numeric_week, df_merged_w, "Model 0A")
##
## Running Model 0A
##
## Call:
## lm(formula = formula, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.72559 -0.15917 0.01874 0.16674 0.65873
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.63582 0.10036 145.84 <2e-16 ***
## numeric_week 0.53273 0.02372 22.46 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2406 on 153 degrees of freedom
## Multiple R-squared: 0.7673, Adjusted R-squared: 0.7658
## F-statistic: 504.5 on 1 and 153 DF, p-value: < 2.2e-16
df_merged_w <- add_predictions(ols0w, df_merged_w, "predicted_sales0")
## Model 1A: Trend + Seasonality (monthly dummies on weekly data)
ols1w <- run_model(sales_w ~ numeric_week + seasonal_month, df_merged_w, "Model 1A")
##
## Running Model 1A
##
## Call:
## lm(formula = formula, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.62472 -0.14172 0.01215 0.16457 0.57707
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.67383 0.11499 127.613 <2e-16 ***
## numeric_week 0.53053 0.02537 20.915 <2e-16 ***
## seasonal_monthFebruary 0.01863 0.09432 0.197 0.844
## seasonal_monthMarch -0.07386 0.09266 -0.797 0.427
## seasonal_monthApril -0.04300 0.09278 -0.463 0.644
## seasonal_monthMay 0.05120 0.09279 0.552 0.582
## seasonal_monthJune 0.03179 0.09343 0.340 0.734
## seasonal_monthJuly 0.09360 0.09164 1.021 0.309
## seasonal_monthAugust -0.03924 0.09572 -0.410 0.683
## seasonal_monthSeptember -0.12111 0.09425 -1.285 0.201
## seasonal_monthOctober -0.08215 0.09245 -0.889 0.376
## seasonal_monthNovember -0.05447 0.09654 -0.564 0.574
## seasonal_monthDecember -0.13661 0.09221 -1.481 0.141
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2394 on 142 degrees of freedom
## Multiple R-squared: 0.7862, Adjusted R-squared: 0.7681
## F-statistic: 43.5 on 12 and 142 DF, p-value: < 2.2e-16
df_merged_w <- add_predictions(ols1w, df_merged_w, "predicted_sales1")
## Model 2A: Experimentation
# Start with the full model (excluding food and bar, which are components of sales)
ols2_full_w <- lm(
sales_w ~ numeric_week + seasonal_month + fx_w +
google_w + temp_w + rain_w,
data = df_merged_w
)
# Perform backward stepwise regression (AIC criterion)
ols2_stepwise_w <- step(
ols2_full_w,
direction = "backward",
trace = 1 # Prints the stepwise regression process
)
## Start: AIC=-470.72
## sales_w ~ numeric_week + seasonal_month + fx_w + google_w + temp_w +
## rain_w
##
## Df Sum of Sq RSS AIC
## - google_w 1 0.0047 5.9773 -472.59
## - temp_w 1 0.0524 6.0250 -471.36
## <none> 5.9726 -470.72
## - seasonal_month 11 0.9898 6.9624 -468.95
## - rain_w 1 0.1643 6.1370 -468.51
## - fx_w 1 1.1370 7.1096 -445.71
## - numeric_week 1 16.1784 22.1510 -269.56
##
## Step: AIC=-472.59
## sales_w ~ numeric_week + seasonal_month + fx_w + temp_w + rain_w
##
## Df Sum of Sq RSS AIC
## - temp_w 1 0.0597 6.0370 -473.05
## <none> 5.9773 -472.59
## - seasonal_month 11 0.9855 6.9628 -470.94
## - rain_w 1 0.1652 6.1426 -470.37
## - fx_w 1 1.1490 7.1264 -447.34
## - numeric_week 1 19.7232 25.7005 -248.52
##
## Step: AIC=-473.05
## sales_w ~ numeric_week + seasonal_month + fx_w + rain_w
##
## Df Sum of Sq RSS AIC
## <none> 6.0370 -473.05
## - rain_w 1 0.2277 6.2648 -469.31
## - seasonal_month 11 1.3008 7.3379 -464.81
## - fx_w 1 1.6010 7.6380 -438.59
## - numeric_week 1 20.3284 26.3655 -246.56
# Summary of the final stepwise model (google_w and temp_w were dropped)
summary(ols2_stepwise_w)
##
## Call:
## lm(formula = sales_w ~ numeric_week + seasonal_month + fx_w +
## rain_w, data = df_merged_w)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5797 -0.1243 0.0106 0.1247 0.5622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.941702 2.597439 -0.748 0.4560
## numeric_week 0.542540 0.024988 21.712 < 2e-16 ***
## seasonal_monthFebruary 0.003119 0.082343 0.038 0.9698
## seasonal_monthMarch -0.008609 0.081117 -0.106 0.9156
## seasonal_monthApril 0.058496 0.084093 0.696 0.4878
## seasonal_monthMay 0.160561 0.085157 1.885 0.0614 .
## seasonal_monthJune 0.171277 0.088309 1.940 0.0544 .
## seasonal_monthJuly 0.140843 0.079982 1.761 0.0804 .
## seasonal_monthAugust -0.015579 0.083209 -0.187 0.8518
## seasonal_monthSeptember -0.130180 0.082537 -1.577 0.1170
## seasonal_monthOctober -0.116443 0.081307 -1.432 0.1543
## seasonal_monthNovember -0.011869 0.090153 -0.132 0.8954
## seasonal_monthDecember -0.096493 0.081089 -1.190 0.2361
## fx_w 2.106952 0.345784 6.093 1.01e-08 ***
## rain_w -0.121385 0.052818 -2.298 0.0230 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2077 on 140 degrees of freedom
## Multiple R-squared: 0.8414, Adjusted R-squared: 0.8255
## F-statistic: 53.04 on 14 and 140 DF, p-value: < 2.2e-16
# Add predictions from the final stepwise model
df_merged_w <- add_predictions(ols2_stepwise_w, df_merged_w, "predicted_sales2")
# Plot Actual vs Predicted Values (weekly)
par(mfrow = c(1,1))
plot(
df_merged_w$week, exp(df_merged_w$sales_w),
type = "p", # Points for actual sales
col = "black", pch = 16,
xlab = "Week",
ylab = "Sales",
main = "Actual vs Predicted Weekly Sales for All Models"
)
# Add the lines for each model
lines(
df_merged_w$week, exp(df_merged_w$predicted_sales0),
col = plot_colors[2],
lty = line_types[2],
lwd = 2
)
lines(
df_merged_w$week, exp(df_merged_w$predicted_sales1),
col = plot_colors[3],
lty = line_types[3],
lwd = 2
)
lines(
df_merged_w$week, exp(df_merged_w$predicted_sales2),
col = plot_colors[4],
lty = 4,
lwd = 2
)
# Add a legend
legend(
"bottomright",
legend = c("Actual Sales", "Model 0", "Model 1", "Model 2 Stepwise"),
col = c("black", plot_colors[2:4]),
lty = c(NA, line_types[2:4]),
pch = c(16, NA, NA, NA),
bty = "n"
)
# Models to compare (the list was previously declared twice in a row with
# identical contents; one declaration is enough)
models_w <- list(
"Model trend" = ols0w,
"Model trend + season" = ols1w,
"Model all covariates step" = ols2_stepwise_w
)
# Get R^2 and AIC for each weekly model
model_stats_w <- get_model_stats(models_w)
# View the results
print(model_stats_w)
## Model R2 AIC
## 1 Model trend 0.7672955 2.217403
## 2 Model trend + season 0.7861620 11.112056
## 3 Model all covariates step 0.8413760 -31.183618
# RMSE on the original (exponentiated) scale, one value per weekly model.
# Model i stored its fitted values in column paste0("predicted_sales", i - 1),
# in the same order as the models_w list.
rmse_values_w <- vapply(
  seq_along(models_w),
  function(i) {
    calculate_rmse(
      observed = exp(df_merged_w$sales_w), # exponentiate actual values
      predicted = exp(df_merged_w[[paste0("predicted_sales", i - 1)]])
    )
  },
  numeric(1)
)
# Build the RMSE table in one shot (avoids growing a data frame with rbind)
rmse_stats_w <- data.frame(
  Model = names(models_w),
  RMSE = rmse_values_w,
  stringsAsFactors = FALSE
)
# Combine R^2, AIC, and RMSE into one table
combined_stats_w <- merge(model_stats_w, rmse_stats_w, by = "Model")
# View the combined table
print(combined_stats_w)
## Model R2 AIC RMSE
## 1 Model all covariates step 0.8413760 -31.183618 4000370
## 2 Model trend 0.7672955 2.217403 4745735
## 3 Model trend + season 0.7861620 11.112056 4668952
# RMSE of the best weekly OLS model (the stepwise model). Select the row by
# name rather than by position so a reordering of models_w cannot silently
# pick the wrong value.
rmse_ols_w <- rmse_stats_w$RMSE[rmse_stats_w$Model == "Model all covariates step"]
rmse_ols_w
## [1] 4000370
# Check the residuals of the best weekly model; the Ljung-Box p-value is
# essentially zero, so the weekly residuals ARE autocorrelated
checkresiduals(ols2_stepwise_w$residuals)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 80.439, df = 10, p-value = 4.118e-13
##
## Model df: 0. Total lags used: 10
The weekly residuals are autocorrelated and still show structure, so this model does not fully capture the weekly dynamics.
# Daily Models ----
head(df_merged_d,25)
# Drop the leading rows so the series properly starts in December
df_merged_d <- df_merged_d %>%
filter(date > "2021-11-30")
head(df_merged_d)
## Model 0: Trend only
# NOTE(review): the label "Model 0A" is reused from the weekly section —
# presumably a copy-paste; consider "Model 0D" for the daily model
ols0d <- run_model(sales_cop ~ numeric_day, df_merged_d, "Model 0A")
##
## Running Model 0A
##
## Call:
## lm(formula = formula, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.2083 -0.1814 0.2027 0.5771 1.6749
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.5199 0.4261 24.688 <2e-16 ***
## numeric_day 0.6774 0.0695 9.746 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.762 on 1038 degrees of freedom
## Multiple R-squared: 0.08383, Adjusted R-squared: 0.08295
## F-statistic: 94.98 on 1 and 1038 DF, p-value: < 2.2e-16
df_merged_d <- add_predictions(ols0d, df_merged_d, "predicted_sales0")
## Model 1: Trend + Seasonality (month dummies plus day-of-week dummies)
ols1d <- run_model(sales_cop ~ numeric_day + seasonal_month + day_of_week, df_merged_d, "Model 1A")
##
## Running Model 1A
##
## Call:
## lm(formula = formula, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.1930 -0.1446 0.1520 0.4321 2.1992
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.41648 0.46748 22.282 < 2e-16 ***
## numeric_day 0.60222 0.07264 8.290 3.54e-16 ***
## seasonal_monthFebruary 0.37785 0.26287 1.437 0.15091
## seasonal_monthMarch 0.24291 0.25677 0.946 0.34435
## seasonal_monthApril 0.21725 0.26027 0.835 0.40407
## seasonal_monthMay 0.03779 0.25763 0.147 0.88342
## seasonal_monthJune 0.15499 0.26174 0.592 0.55388
## seasonal_monthJuly 0.23454 0.25933 0.904 0.36599
## seasonal_monthAugust 0.25260 0.26021 0.971 0.33190
## seasonal_monthSeptember -0.03710 0.26437 -0.140 0.88843
## seasonal_monthOctober 0.23107 0.26387 0.876 0.38140
## seasonal_monthNovember 0.08953 0.29411 0.304 0.76087
## seasonal_monthDecember -0.71856 0.25824 -2.783 0.00549 **
## day_of_weekTuesday -0.06744 0.19830 -0.340 0.73385
## day_of_weekWednesday 0.22634 0.19767 1.145 0.25247
## day_of_weekThursday 0.47521 0.19734 2.408 0.01621 *
## day_of_weekFriday 1.10918 0.19862 5.584 3.01e-08 ***
## day_of_weekSaturday 0.93769 0.19827 4.729 2.57e-06 ***
## day_of_weekSunday 0.61578 0.19826 3.106 0.00195 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.705 on 1021 degrees of freedom
## Multiple R-squared: 0.1557, Adjusted R-squared: 0.1408
## F-statistic: 10.46 on 18 and 1021 DF, p-value: < 2.2e-16
df_merged_d <- add_predictions(ols1d, df_merged_d, "predicted_sales1")
# Model 2: Backward stepwise selection
head(df_merged_d)
# Start with the full model (excluding food and bar, which are components of sales)
ols2_full_d <- lm(
sales_cop ~ numeric_day + seasonal_month + day_of_week + fx +
tmedian + rain_sum,
data = df_merged_d
)
summary(ols2_full_d)
##
## Call:
## lm(formula = sales_cop ~ numeric_day + seasonal_month + day_of_week +
## fx + tmedian + rain_sum, data = df_merged_d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.2560 -0.1301 0.1434 0.4180 2.2430
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.681458 7.563025 0.487 0.62653
## numeric_day 0.555771 0.078969 7.038 3.58e-12 ***
## seasonal_monthFebruary 0.409437 0.263765 1.552 0.12091
## seasonal_monthMarch 0.254867 0.257099 0.991 0.32176
## seasonal_monthApril 0.215634 0.264563 0.815 0.41523
## seasonal_monthMay 0.002179 0.263006 0.008 0.99339
## seasonal_monthJune 0.125315 0.268566 0.467 0.64088
## seasonal_monthJuly 0.230188 0.260048 0.885 0.37627
## seasonal_monthAugust 0.284052 0.260701 1.090 0.27616
## seasonal_monthSeptember 0.004766 0.265632 0.018 0.98569
## seasonal_monthOctober 0.152011 0.266104 0.571 0.56796
## seasonal_monthNovember -0.092994 0.303301 -0.307 0.75921
## seasonal_monthDecember -0.744359 0.258962 -2.874 0.00413 **
## day_of_weekTuesday -0.063129 0.198020 -0.319 0.74994
## day_of_weekWednesday 0.237591 0.197534 1.203 0.22934
## day_of_weekThursday 0.471793 0.197065 2.394 0.01684 *
## day_of_weekFriday 1.102140 0.198329 5.557 3.50e-08 ***
## day_of_weekSaturday 0.930391 0.198011 4.699 2.98e-06 ***
## day_of_weekSunday 0.615126 0.198134 3.105 0.00196 **
## fx 0.787878 0.880843 0.894 0.37129
## tmedian -0.143825 0.999347 -0.144 0.88559
## rain_sum 0.137507 0.101046 1.361 0.17387
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.703 on 1018 degrees of freedom
## Multiple R-squared: 0.1609, Adjusted R-squared: 0.1436
## F-statistic: 9.294 on 21 and 1018 DF, p-value: < 2.2e-16
# Perform backward stepwise regression (AIC criterion)
ols2_stepwise_d <- step(
ols2_full_d,
direction = "backward",
trace = 1 # Prints the stepwise regression process
)
## Start: AIC=1128.63
## sales_cop ~ numeric_day + seasonal_month + day_of_week + fx +
## tmedian + rain_sum
##
## Df Sum of Sq RSS AIC
## - tmedian 1 0.060 2951.0 1126.7
## - fx 1 2.319 2953.3 1127.5
## - rain_sum 1 5.368 2956.3 1128.5
## <none> 2951.0 1128.6
## - seasonal_month 11 79.118 3030.1 1134.2
## - numeric_day 1 143.580 3094.6 1176.0
## - day_of_week 6 175.600 3126.6 1176.8
##
## Step: AIC=1126.65
## sales_cop ~ numeric_day + seasonal_month + day_of_week + fx +
## rain_sum
##
## Df Sum of Sq RSS AIC
## - fx 1 2.277 2953.3 1125.5
## <none> 2951.0 1126.7
## - rain_sum 1 6.045 2957.1 1126.8
## - seasonal_month 11 79.429 3030.5 1132.3
## - day_of_week 6 175.581 3126.6 1174.8
## - numeric_day 1 146.691 3097.7 1175.1
##
## Step: AIC=1125.45
## sales_cop ~ numeric_day + seasonal_month + day_of_week + rain_sum
##
## Df Sum of Sq RSS AIC
## <none> 2953.3 1125.5
## - rain_sum 1 15.758 2969.1 1129.0
## - seasonal_month 11 79.049 3032.4 1130.9
## - numeric_day 1 144.596 3097.9 1173.2
## - day_of_week 6 175.164 3128.5 1173.4
# Summary of the final stepwise model (tmedian and fx were dropped)
summary(ols2_stepwise_d)
##
## Call:
## lm(formula = sales_cop ~ numeric_day + seasonal_month + day_of_week +
## rain_sum, data = df_merged_d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.1816 -0.1390 0.1437 0.4305 2.3591
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.54422 0.59782 15.965 < 2e-16 ***
## numeric_day 0.54288 0.07682 7.067 2.93e-12 ***
## seasonal_monthFebruary 0.42246 0.26300 1.606 0.10851
## seasonal_monthMarch 0.24004 0.25622 0.937 0.34905
## seasonal_monthApril 0.17428 0.26036 0.669 0.50339
## seasonal_monthMay -0.03250 0.25883 -0.126 0.90009
## seasonal_monthJune 0.07827 0.26324 0.297 0.76626
## seasonal_monthJuly 0.21366 0.25892 0.825 0.40946
## seasonal_monthAugust 0.27803 0.25988 1.070 0.28494
## seasonal_monthSeptember 0.01683 0.26481 0.064 0.94933
## seasonal_monthOctober 0.17391 0.26444 0.658 0.51091
## seasonal_monthNovember -0.08671 0.30304 -0.286 0.77483
## seasonal_monthDecember -0.75067 0.25804 -2.909 0.00370 **
## day_of_weekTuesday -0.06287 0.19788 -0.318 0.75076
## day_of_weekWednesday 0.24147 0.19735 1.224 0.22139
## day_of_weekThursday 0.47065 0.19692 2.390 0.01703 *
## day_of_weekFriday 1.10202 0.19821 5.560 3.45e-08 ***
## day_of_weekSaturday 0.92892 0.19788 4.694 3.04e-06 ***
## day_of_weekSunday 0.61671 0.19783 3.117 0.00188 **
## rain_sum 0.18939 0.08118 2.333 0.01985 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.702 on 1020 degrees of freedom
## Multiple R-squared: 0.1602, Adjusted R-squared: 0.1446
## F-statistic: 10.24 on 19 and 1020 DF, p-value: < 2.2e-16
# Add predictions from the final stepwise model
df_merged_d <- add_predictions(ols2_stepwise_d, df_merged_d, "predicted_sales2")
# Plot Actual vs Predicted Values (daily)
par(mfrow = c(1,1))
plot(
df_merged_d$date, exp(df_merged_d$sales_cop),
type = "p", # Points for actual sales
col = "black", pch = 16,
xlab = "Date",
ylab = "Sales",
main = "Actual vs Predicted Daily Sales for All Models"
)
# Add the lines for each model
lines(
df_merged_d$date, exp(df_merged_d$predicted_sales0),
col = plot_colors[2],
lty = line_types[2],
lwd = 2
)
lines(
df_merged_d$date, exp(df_merged_d$predicted_sales1),
col = plot_colors[3],
lty = line_types[3],
lwd = 2
)
lines(
df_merged_d$date, exp(df_merged_d$predicted_sales2),
col = plot_colors[4],
lty = 4,
lwd = 2
)
# Add a legend
legend(
"topleft",
legend = c("Actual Sales", "Model 0", "Model 1", "Model 2 Stepwise"),
col = c("black", plot_colors[2:4]),
lty = c(NA, line_types[2:4]),
pch = c(16, NA, NA, NA),
bty = "n"
)
# Models to compare (the list was previously declared twice in a row with
# identical contents; one declaration is enough)
models_d <- list(
"Model trend" = ols0d,
"Model trend + season" = ols1d,
"Model all covariates step" = ols2_stepwise_d
)
# Get R^2 and AIC for each daily model
model_stats_d <- get_model_stats(models_d)
# RMSE on the original (exponentiated) scale, one value per daily model.
# Model i stored its fitted values in column paste0("predicted_sales", i - 1),
# in the same order as the models_d list.
rmse_values_d <- vapply(
  seq_along(models_d),
  function(i) {
    calculate_rmse(
      observed = exp(df_merged_d$sales_cop), # exponentiate actual values
      predicted = exp(df_merged_d[[paste0("predicted_sales", i - 1)]])
    )
  },
  numeric(1)
)
# Build the RMSE table in one shot (avoids growing a data frame with rbind)
rmse_stats_d <- data.frame(
  Model = names(models_d),
  RMSE = rmse_values_d,
  stringsAsFactors = FALSE
)
# Combine R^2, AIC, and RMSE into one table
combined_stats_d <- merge(model_stats_d, rmse_stats_d, by = "Model")
# View the combined table
print(combined_stats_d)
## Model R2 AIC RMSE
## 1 Model all covariates step 0.16020994 4078.846 1421301
## 2 Model trend 0.08382823 4133.380 1845259
## 3 Model trend + season 0.15572908 4082.381 1421549
# RMSE of the best daily OLS model (the stepwise model). Select the row by
# name rather than by position so a reordering of models_d cannot silently
# pick the wrong value.
rmse_ols_d <- rmse_stats_d$RMSE[rmse_stats_d$Model == "Model all covariates step"]
rmse_ols_d
## [1] 1421301
# Check the residuals of the best daily model; the Ljung-Box p-value (0.018)
# is below 0.05, so the daily residuals show significant autocorrelation
checkresiduals(ols2_stepwise_d$residuals)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 21.421, df = 10, p-value = 0.01834
##
## Model df: 0. Total lags used: 10
The daily residuals show autocorrelation, and several observations lie far from the mean.
Here we explore non-linear models, starting from the simplest and moving to more elaborate ones, in the end combining some of the models already used.
The time series are adjusted so that the visualizations are easier to understand; essentially, we change the date index of the time-series objects.
# Re-declare the time series because we dropped some rows earlier.
# Ensure the 'date' columns are in Date format
df_merged_d$date <- as.Date(df_merged_d$date)
df_merged_w$date <- as.Date(df_merged_w$week)
df_merged_m$date <- as.Date(df_merged_m$month)
# Extract the start date for each dataframe
start_d <- min(df_merged_d$date)
start_w <- min(df_merged_w$date)
start_m <- min(df_merged_m$date)
# Extract components for daily, weekly, and monthly start times
start_d_year <- as.numeric(format(start_d, "%Y"))
start_d_day <- as.numeric(format(start_d, "%j")) # Day of the year
start_w_year <- as.numeric(format(start_w, "%Y"))
# NOTE(review): "%U" counts weeks starting on Sunday; confirm this matches
# how the weekly aggregation was built
start_w_week <- as.numeric(format(start_w, "%U")) + 1 # Week number, adding 1 since R starts at week 0
start_m_year <- as.numeric(format(start_m, "%Y"))
start_m_month <- as.numeric(format(start_m, "%m"))
# Declare time series with appropriate frequencies; sales columns are stored
# on the log scale, so exp() recovers the original units (COP)
sales_d_ts <- ts(exp(df_merged_d$sales_cop), start = c(start_d_year, start_d_day), frequency = 365)
sales_w_ts <- ts(exp(df_merged_w$sales_w), start = c(start_w_year, start_w_week), frequency = 52)
sales_m_ts <- ts(exp(df_merged_m$sales_m), start = c(start_m_year, start_m_month), frequency = 12)
food_d_ts <- ts(exp(df_merged_d$food), start = c(start_d_year, start_d_day), frequency = 365)
food_w_ts <- ts(exp(df_merged_w$food_w), start = c(start_w_year, start_w_week), frequency = 52)
food_m_ts <- ts(exp(df_merged_m$food_m), start = c(start_m_year, start_m_month), frequency = 12)
bar_d_ts <- ts(exp(df_merged_d$bar), start = c(start_d_year, start_d_day), frequency = 365)
bar_w_ts <- ts(exp(df_merged_w$bar_w), start = c(start_w_year, start_w_week), frequency = 52)
bar_m_ts <- ts(exp(df_merged_m$bar_m), start = c(start_m_year, start_m_month), frequency = 12)
# Verify the created time series visually
par(mfrow=c(1,1))
plot(sales_d_ts)
plot(sales_w_ts)
plot(sales_m_ts)
plot(food_d_ts)
plot(food_w_ts)
plot(food_m_ts)
plot(bar_d_ts)
plot(bar_w_ts)
plot(bar_m_ts)
Here we fill the sales = 0 values with the mean of the two adjacent dates, in order to obtain smoother models. The dates with sales = 0 are national holidays such as Christmas or New Year's Day, or inventory days on which the kitchen cannot operate, so sales are 0.
# Replace sentinel values of 1 with the mean of the previous and next
# observations. A value of exactly 1 corresponds to exp(0), i.e. a closed
# day whose log-sales were 0, so these points are smoothed away.
#
# @param ts_data A ts object.
# @return A ts object with the same start and frequency in which every value
#   equal to 1 is replaced by the mean of its neighbours (a boundary value
#   of 1 uses its single existing neighbour).
fill_ones <- function(ts_data) {
  # Work on a plain numeric vector
  ts_vec <- as.numeric(ts_data)
  for (i in seq_along(ts_vec)) {
    # isTRUE() guards against NA entries, which would make a bare
    # if (ts_vec[i] == 1) error out; scalar if/else replaces the
    # scalar misuse of ifelse()
    if (isTRUE(ts_vec[i] == 1)) {
      prev_val <- if (i > 1) ts_vec[i - 1] else NA
      next_val <- if (i < length(ts_vec)) ts_vec[i + 1] else NA
      # Mean of the available neighbours, ignoring the missing side
      ts_vec[i] <- mean(c(prev_val, next_val), na.rm = TRUE)
    }
  }
  # Return as a time series with the original attributes
  ts(ts_vec, start = start(ts_data), frequency = frequency(ts_data))
}
# Apply the smoothing to every series
sales_d_ts <- fill_ones(sales_d_ts)
sales_w_ts <- fill_ones(sales_w_ts)
sales_m_ts <- fill_ones(sales_m_ts)
food_d_ts <- fill_ones(food_d_ts)
food_w_ts <- fill_ones(food_w_ts)
food_m_ts <- fill_ones(food_m_ts)
bar_d_ts <- fill_ones(bar_d_ts)
bar_w_ts <- fill_ones(bar_w_ts)
bar_m_ts <- fill_ones(bar_m_ts)
# Some simple plots
plot(sales_m_ts)
plot(cumsum(sales_m_ts)) # cumulative sums: the series the Bass model is fitted to
# Bass model (monthly); estimates market potential m, innovation p, imitation q
bm_m<-BM(sales_m_ts,display = T) # show graphical view of results / display = True
summary(bm_m)
## Call: ( Standard Bass Model )
##
## BM(series = sales_m_ts, display = T)
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -94382506 -37311459 -12636337 -10394220 24066002 75506534
##
## Coefficients:
## Estimate Std.Error Lower Upper p-value
## m 4.664349e+09 2.000317e+08 4.272294e+09 5.056404e+09 4.48e-22 ***
## p 8.332383e-03 2.359472e-04 7.869934e-03 8.794831e-03 8.52e-28 ***
## q 8.966748e-02 5.371447e-03 7.913963e-02 1.001953e-01 1.18e-17 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error 47282474 on 33 degrees of freedom
## Multiple R-squared: 0.999374 Residual sum of squares: 7.377587e+16
# Remaining market potential: estimated m minus sales realised so far
bm_m$coefficients['m'] - sum(sales_m_ts)
## m
## 1148560231
According to this, only about 1.1 billion COP remain to be sold — less than a year of sales — which seems wrong. The model fits well, but the behaviour from observation 30 onward is weird, and sales may not actually be declining yet. Still, the model reflects innovation and imitation effects in some sense.
The restaurant relies on word of mouth to reach its full potential of m = 4,664,000,000 COP (approximately 1 million EUR). It has already sold 3,515,788,885 COP, so according to the model sales should be exhausted within about one year. The innovation coefficient p = 0.83% indicates that adoption due to external influence is relatively low, though not uncommon for many markets — the restaurant is actually relatively innovative. The imitation coefficient q = 8.97% suggests that imitation plays a larger role than innovation in driving adoption in this market.
# In-sample cumulative predictions from the monthly Bass model
pred_bm_m<- predict(bm_m, newx=c(1:length(sales_m_ts)))
pred_bm_m <- ts(pred_bm_m, start = start(sales_m_ts), frequency = frequency(sales_m_ts))
# Convert cumulative predictions to per-period (instantaneous) values
pred.inst_bm_m <- make.instantaneous(pred_bm_m)
pred.inst_bm_m <- ts(pred.inst_bm_m, start = start(sales_m_ts), frequency = frequency(sales_m_ts))
# Plot actual vs fitted monthly sales
plot(sales_m_ts, type = "p", col = "black", pch = 16, cex = 0.7,
xlab = "Month", ylab = "Monthly Sales", main = "Actual vs Fitted Sales")
# Add the fitted values as a line
lines(pred.inst_bm_m, col = "red", lwd = 2)
# Add a legend
legend("topleft", legend = c("Actual Values", "Fitted Values"),
col = c("black", "red"), pch = c(16, NA), lty = c(NA, 1), lwd = c(NA, 2))
# Check residuals (ACF/PACF via tsdisplay)
res_bm_m <- sales_m_ts - pred.inst_bm_m
tsdisplay(res_bm_m)
The residuals show some structure, and lag 2 is significantly autocorrelated.
# Calculate RMSE for the monthly Bass model predictions
rmse_bm_m <- calculate_rmse(observed = sales_m_ts, predicted = pred.inst_bm_m)
# Print the RMSE
cat("RMSE for Bass Model Predictions:", rmse_bm_m, "\n")
## RMSE for Bass Model Predictions: 18498870
# Bass model (weekly)
bm_w<-BM(sales_w_ts,display = T) # show graphical view of results / display = True
summary(bm_w)
## Call: ( Standard Bass Model )
##
## BM(series = sales_w_ts, display = T)
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -90378130 -33009033 -13970791 -8913677 19483237 72394978
##
## Coefficients:
## Estimate Std.Error Lower Upper p-value
## m 4.694285e+09 9.071530e+07 4.516486e+09 4.872084e+09 2.04e-98 ***
## p 2.077484e-03 2.457719e-05 2.029314e-03 2.125654e-03 1.08e-129 ***
## q 1.989569e-02 5.380539e-04 1.884113e-02 2.095026e-02 7.05e-78 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error 40676860 on 152 degrees of freedom
## Multiple R-squared: 0.999528 Residual sum of squares: 2.515003e+17
# Remaining market potential for the weekly model
bm_w$coefficients['m'] - sum(sales_w_ts)
## m
## 1178496087
# Results are similar in terms of m; p and q are on another scale
# because the two models use different time steps
bm_m$coefficients['q'] / bm_w$coefficients['q'] # they are approx 4 times
## q
## 4.506878
bm_m$coefficients['p'] / bm_w$coefficients['p'] # they are approx 4 times
## p
## 4.010805
# which makes sense: there are about 4 weeks in a month
The coefficients are approximately 4 times those of the monthly model, which makes sense because there are about 4 weeks in a month, while the estimated market potential is similar.
# In-sample cumulative predictions from the weekly Bass model
pred_bm_w<- predict(bm_w, newx=c(1:length(sales_w_ts)))
pred_bm_w <- ts(pred_bm_w, start = start(sales_w_ts), frequency = frequency(sales_w_ts))
# Convert cumulative predictions to per-period (instantaneous) values
pred.inst_bm_w <- make.instantaneous(pred_bm_w)
pred.inst_bm_w <- ts(pred.inst_bm_w, start = start(sales_w_ts), frequency = frequency(sales_w_ts))
# Plot actual vs fitted weekly sales
plot(sales_w_ts, type = "p", col = "black", pch = 16, cex = 0.7,
xlab = "Week", ylab = "Weekly Sales", main = "Actual vs Fitted Sales")
# Add the fitted values as a line
lines(pred.inst_bm_w, col = "red", lwd = 2)
# Add a legend
legend("topleft", legend = c("Actual Values", "Fitted Values"),
col = c("black", "red"), pch = c(16, NA), lty = c(NA, 1), lwd = c(NA, 2))
# Check residuals (ACF/PACF via tsdisplay)
res_bm_w <- sales_w_ts - pred.inst_bm_w
tsdisplay(res_bm_w)
The residuals show autocorrelation at lag 2, with a clear trend and remaining structure.
# RMSE
# Calculate RMSE for the weekly Bass model predictions
rmse_bm_w <- calculate_rmse(observed = sales_w_ts, predicted = pred.inst_bm_w)
# Print the RMSE
cat("RMSE for Bass Model Predictions:", rmse_bm_w, "\n")
## RMSE for Bass Model Predictions: 4542019
# Bass model (daily); preliminary estimates are supplied to help convergence
bm_d <- BM(
sales_d_ts,
prelimestimates = c(1.2 * sum(sales_d_ts), 0.005, 0.5), # Adjust these estimates
display = TRUE
)
summary(bm_d)
## Call: ( Standard Bass Model )
##
## BM(series = sales_d_ts, prelimestimates = c(1.2 * sum(sales_d_ts),
## 0.005, 0.5), display = TRUE)
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -81086932 -29957254 -10704557 -7197438 17139516 79381813
##
## Coefficients:
## Estimate Std.Error Lower Upper p-value
## m 4.631642e+09 3.177263e+07 4.569369e+09 4.693916e+09 0 ***
## p 3.258569e-04 1.314186e-06 3.232811e-04 3.284326e-04 0 ***
## q 2.847155e-03 2.754612e-05 2.793166e-03 2.901144e-03 0 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error 34753453 on 1037 degrees of freedom
## Multiple R-squared: 0.999642 Residual sum of squares: 1.252491e+18
# Remaining market potential for the daily model
bm_d$coefficients['m'] - sum(sales_d_ts)
## m
## 1178708770
# Results are similar in terms of m; p and q are on another scale
# because the two models use different time steps
bm_w$coefficients['q'] / bm_d$coefficients['q'] # they are approx 7 times
## q
## 6.987921
bm_w$coefficients['p'] / bm_d$coefficients['p'] # they are approx 7 times
## p
## 6.375449
The daily coefficients are approximately 1/7 of those in the weekly model, which makes sense since there are 7 days in a week. The estimated market potential is also similar in order of magnitude.
# In-sample cumulative predictions from the daily Bass model
pred_bm_d <- predict(bm_d, newx = c(1:length(sales_d_ts)))
pred_bm_d <- ts(pred_bm_d, start = start(sales_d_ts), frequency = frequency(sales_d_ts))
# Convert cumulative predictions to per-period (instantaneous) values
pred.inst_bm_d <- make.instantaneous(pred_bm_d)
pred.inst_bm_d <- ts(pred.inst_bm_d, start = start(sales_d_ts), frequency = frequency(sales_d_ts))
# Plot actual vs fitted sales for daily data
plot(sales_d_ts, type = "p", col = "black", pch = 16, cex = 0.7,
xlab = "Day", ylab = "Daily Sales", main = "Actual vs Fitted Sales (Daily)")
# Add the fitted values as a line
lines(pred.inst_bm_d, col = "red", lwd = 2)
# Add a legend
legend("topleft", legend = c("Actual Values", "Fitted Values"),
col = c("black", "red"), pch = c(16, NA), lty = c(NA, 1), lwd = c(NA, 2))
# Check residuals (ACF/PACF via tsdisplay)
res_bm_d <- sales_d_ts - pred.inst_bm_d
tsdisplay(res_bm_d)
The residuals do not seem stationary, or at least they show a lot of autocorrelation.
# Calculate RMSE for the daily Bass model predictions
rmse_bm_d <- calculate_rmse(observed = sales_d_ts, predicted = pred.inst_bm_d)
# Print the RMSE
cat("RMSE for Daily Bass Model Predictions:", rmse_bm_d, "\n")
## RMSE for Daily Bass Model Predictions: 1651896
The Bass model assumes that every product succeeds and that sales saturate to a steady-state level. However, most new products fail in reality.
The market potential m is assumed constant along the whole life cycle.
Bass model predictions work well only after the sales inflection point; if sales of a category keep rising like a J-curve, the model can overestimate the overall market size.
It is a model for products with a limited life cycle, which is itself a hypothesis that needs justification.
Another drawback of the Bass model is that the diffusion pattern is not affected by marketing-mix variables such as price or advertising.
The Generalized Bass model extends the original Bass model by allowing for the role of marketing-mix variables.
The Bass model is used to forecast the adoption of a new product and to predict sales, since it determines the shape of the curve representing the cumulative adoption of a new product. The Generalized Bass model extends it by incorporating marketing-mix variables, so we can measure the effect of pricing and promotions on the diffusion curve; it is therefore more flexible than the original Bass model.
# Starting values for the GBM, taken from a previous Bass-model-like fit
m <- 4.451570e+09
p <- 8.472917e-03
q <- 9.415625e-02
# Generalized Bass model with one exponential shock; prelimestimates are
# (m, p, q, shock start a1, shock intensity b1, shock memory c1) — other
# starting-value combinations that were tried are kept commented out
GBM_monthly_sales <- GBM(
sales_m_ts,
shock = 'exp',
nshock = 1,
#prelimestimates = c(m,p,q, 12, 0.1, -0.1)
prelimestimates = c(m,p,q, 10, 0.1, 2)
#prelimestimates = c(m,p,q, 11, 15, -0.1)
)
# NOTE(review): m, p and q are not significant here (p-values ~0.52),
# suggesting the shock parameters absorb most of the fit — interpret with care
summary(GBM_monthly_sales)
## Call: ( Generalized Bass model with 1 Exponential shock )
##
## GBM(series = sales_m_ts, shock = "exp", nshock = 1, prelimestimates = c(m,
## p, q, 10, 0.1, 2))
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -27853732 -10753200 -127747 -619963 6529609 25477494
##
## Coefficients:
## Estimate Std.Error Lower Upper p-value
## m 1.004455e+10 1.683462e+10 -2.295071e+10 4.303980e+10 5.55e-01
## p 2.360904e-03 3.666121e-03 -4.824562e-03 9.546370e-03 5.24e-01
## q 4.120966e-02 6.359504e-02 -8.343433e-02 1.658536e-01 5.22e-01
## a1 5.329896e+00 2.786829e-01 4.783688e+00 5.876104e+00 2.35e-18 ***
## b1 -9.260056e-02 1.382273e-01 -3.635212e-01 1.783200e-01 5.08e-01
## c1 1.846342e+00 8.143514e-01 2.502422e-01 3.442441e+00 3.07e-02 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error 13902026 on 30 degrees of freedom
## Multiple R-squared: 0.999866 Residual sum of squares: 5.79799e+15
# Predict 60 months ahead and convert to per-period values
pred_GBM_monthly_sales<- predict(GBM_monthly_sales, newx=c(1:60))
pred_GBM_monthly_sales.inst<- make.instantaneous(pred_GBM_monthly_sales)
# Monthly model
# Guseo-Guidolin Model with the standard (base) market-potential dynamic.
# The NaN warnings come from the square-root term going negative during optimization.
ggm1 <- GGM(sales_m_ts, mt='base', display = T)
## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced
## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced
## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced
## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced
## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced
## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced
## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced
## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced
# Alternative market-potential function: chi-squared CDF with 10 df.
ggm2 <- GGM(sales_m_ts, mt= function(x) pchisq(x,10),display = T)
summary(ggm1)
## Call: ( Guseo Guidolin Model )
##
## GGM(series = sales_m_ts, mt = "base", display = T)
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -27544719 -8809035 742251 -337148 7701034 23238738
##
## Coefficients:
## Estimate Std.Error Lower Upper p-value
## K 9.208740e+09 2.454321e+09 4.398360e+09 1.401912e+10 7.24e-04 ***
## pc 2.617639e-02 3.002777e-03 2.029106e-02 3.206173e-02 7.64e-10 ***
## qc 2.453426e-01 3.079890e-02 1.849779e-01 3.057074e-01 5.41e-09 ***
## ps 6.782088e-03 1.329165e-03 4.176972e-03 9.387204e-03 1.60e-05 ***
## qs 3.849713e-02 8.943356e-03 2.096848e-02 5.602579e-02 1.56e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error 13318300 on 31 degrees of freedom
## Multiple R-squared: 0.999873 Residual sum of squares: 5.498691e+15
summary(ggm2)
## Call: ( Guseo Guidolin Model )
##
## GGM(series = sales_m_ts, mt = function(x) pchisq(x, 10), display = T)
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -55479254 -21799576 5818484 25726804 63528263 144216338
##
## Coefficients:
## Estimate Std.Error Lower Upper p-value
## K 6.606326e+10 8.818450e+11 -1.662321e+12 1.794448e+12 0.941
## ps 1.115796e-03 1.526548e-02 -2.880400e-02 3.103559e-02 0.942
## qs 1.615629e-02 2.705220e-02 -3.686506e-02 6.917763e-02 0.554
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error 64933559 on 33 degrees of freedom
## Multiple R-squared: 0.99678 Residual sum of squares: 1.391401e+17
# try different functions for market potential
# ggm3: logarithmic market-potential growth; ggm4: near-linear power function.
ggm3 <- GGM(sales_m_ts, mt= function(x) log(x),display = T)
ggm4 <- GGM(sales_m_ts, mt= function(x) (x)**(1/1.05),display = T)
summary(ggm3)
## Call: ( Guseo Guidolin Model )
##
## GGM(series = sales_m_ts, mt = function(x) log(x), display = T)
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -51543377 -17143219 -1444208 -3434599 7197387 38682408
##
## Coefficients:
## Estimate Std.Error Lower Upper p-value
## K 1.484699e+09 7.664004e+07 1.334488e+09 1.634911e+09 1.35e-19 ***
## ps 1.462118e-02 4.299105e-04 1.377857e-02 1.546379e-02 2.86e-27 ***
## qs 4.575437e-02 4.515590e-03 3.690397e-02 5.460476e-02 1.16e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error 23487767 on 33 degrees of freedom
## Multiple R-squared: 0.999579 Residual sum of squares: 1.820528e+16
summary(ggm4)
## Call: ( Guseo Guidolin Model )
##
## GGM(series = sales_m_ts, mt = function(x) (x)^(1/1.05), display = T)
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -33877044 -9900757 -986705 -1517515 10675797 29488516
##
## Coefficients:
## Estimate Std.Error Lower Upper p-value
## K 1.156377e+09 1.344628e+09 -1.479046e+09 3.791800e+09 3.96e-01
## ps 7.688684e-03 9.373693e-03 -1.068342e-02 2.606078e-02 4.18e-01
## qs -7.001165e-02 1.984748e-03 -7.390169e-02 -6.612162e-02 8.84e-28 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error 16905824 on 33 degrees of freedom
## Multiple R-squared: 0.999782 Residual sum of squares: 9.431627e+15
# GGM parameter values (K, pc, qc, ps, qs) stored for reference.
# NOTE(review): these are assigned but not used below — the predictions
# are generated from the fitted ggm1 object directly.
K <- 7.683785e+09
pc <- 2.698613e-02
qc <- 2.582412e-01
ps <- 7.731763e-03
qs <- 4.508202e-02
# predictions
# In-sample cumulative predictions from the base GGM, re-aligned to the series dates.
pred_ggm_m <- predict(ggm1, newx = c(1:length(sales_m_ts)))
pred_ggm_m <- ts(pred_ggm_m, start = start(sales_m_ts), frequency = frequency(sales_m_ts))
# Convert cumulative predictions to instantaneous (per-month) sales.
pred.inst_ggm_m <- make.instantaneous(pred_ggm_m)
pred.inst_ggm_m <- ts(pred.inst_ggm_m, start = start(sales_m_ts), frequency = frequency(sales_m_ts))
# Plot actual vs fitted sales for monthly data
plot(sales_m_ts, type = "p", col = "black", pch = 16, cex = 0.7,
xlab = "Month", ylab = "Monthly Sales", main = "Actual vs Fitted Sales (GGM Model)")
# Add the fitted values as a line
lines(pred.inst_ggm_m, col = "red", lwd = 2)
# Add a legend
legend("topleft", legend = c("Actual Values", "Fitted Values (GGM)"),
col = c("black", "red"), pch = c(16, NA), lty = c(NA, 1), lwd = c(NA, 2))
#Analysis of residuals
# Residuals = observed monthly sales minus instantaneous GGM predictions.
res_GGM_m<- sales_m_ts - pred.inst_ggm_m
tsdisplay(res_GGM_m)
Residuals look stationary for this model
# Residuals somehow are kind of stationary
# check for stationarity of residuals
# ADF test: H0 = non-stationary; p-value < alpha => reject H0 => stationary.
adf_test <- adf.test(res_GGM_m)
print(adf_test) # if p-val < alpha, series stationary
##
## Augmented Dickey-Fuller Test
##
## data: res_GGM_m
## Dickey-Fuller = -4.1, Lag order = 3, p-value = 0.01708
## alternative hypothesis: stationary
# so with this model we achieve stationary series
# check for autocorrelation in residuals
# Ljung-Box test: H0 = residuals are independently distributed.
Box.test(res_GGM_m, lag = 10, type = "Ljung-Box") # h0 res indep
##
## Box-Ljung test
##
## data: res_GGM_m
## X-squared = 16.263, df = 10, p-value = 0.09234
# p-val > alpha => fail to reject h0, so residuals seem indep
Residuals are likely stationary
# Calculate RMSE for ggm1
# Compare observed monthly sales with instantaneous GGM fit.
rmse_ggm1 <- calculate_rmse(observed = sales_m_ts, predicted = pred.inst_ggm_m)
# Print RMSE for ggm1
cat("RMSE for GGM Model 1 (Base):", rmse_ggm1, "\n")
## RMSE for GGM Model 1 (Base): 11759505
# Weekly
# Same GGM comparison on the weekly series: base dynamic vs chi-squared CDF (25 df).
ggm1_w <- GGM(sales_w_ts, mt='base', display = T)
ggm2_w <- GGM(sales_w_ts, mt= function(x) pchisq(x,25),display = T)
summary(ggm1_w) # this one is better
## Call: ( Guseo Guidolin Model )
##
## GGM(series = sales_w_ts, mt = "base", display = T)
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -30941134 -8439496 1818273 88704 8469414 21113528
##
## Coefficients:
## Estimate Std.Error Lower Upper p-value
## K 6.454454e+09 5.261291e+08 5.423260e+09 7.485648e+09 2.13e-24 ***
## pc 3.542302e-04 2.760784e-05 3.001199e-04 4.083406e-04 6.66e-26 ***
## qc 2.078354e-02 1.326387e-03 1.818387e-02 2.338321e-02 2.16e-33 ***
## ps 1.149074e-02 3.238499e-04 1.085601e-02 1.212547e-02 7.53e-75 ***
## qs 3.110864e-02 2.164583e-03 2.686613e-02 3.535114e-02 5.35e-30 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error 12271825 on 150 degrees of freedom
## Multiple R-squared: 0.999876 Residual sum of squares: 2.258965e+16
summary(ggm2_w)
## Call: ( Guseo Guidolin Model )
##
## GGM(series = sales_w_ts, mt = function(x) pchisq(x, 25), display = T)
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -34347002 -18821378 -462825 7370123 29484361 96549738
##
## Coefficients:
## Estimate Std.Error Lower Upper p-value
## K 5.300184e+09 1.334365e+08 5.038653e+09 5.561715e+09 3.66e-82 ***
## ps 2.098620e-03 2.848830e-05 2.042784e-03 2.154456e-03 7.94e-121 ***
## qs 1.651520e-02 5.058330e-04 1.552378e-02 1.750661e-02 1.41e-70 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error 33658058 on 152 degrees of freedom
## Multiple R-squared: 0.999055 Residual sum of squares: 1.721955e+17
# try different functions for market potential
# ggm3_w: logarithmic growth; ggm4_w: near-linear power function (as for monthly data).
ggm3_w <- GGM(sales_w_ts, mt= function(x) log(x),display = T)
ggm4_w <- GGM(sales_w_ts, mt= function(x) (x)**(1/1.05),display = T)
summary(ggm3_w)
## Call: ( Guseo Guidolin Model )
##
## GGM(series = sales_w_ts, mt = function(x) log(x), display = T)
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -62209555 -22307652 -3737433 -4861230 11043715 47916399
##
## Coefficients:
## Estimate Std.Error Lower Upper p-value
## K 9.913091e+08 1.987146e+07 9.523618e+08 1.030256e+09 3.92e-96 ***
## ps 3.072264e-03 3.266103e-05 3.008250e-03 3.136279e-03 1.29e-136 ***
## qs 1.365598e-02 4.799543e-04 1.271528e-02 1.459667e-02 9.15e-63 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error 26307052 on 152 degrees of freedom
## Multiple R-squared: 0.999423 Residual sum of squares: 1.051933e+17
summary(ggm4_w) # better shaped but less significant
## Call: ( Guseo Guidolin Model )
##
## GGM(series = sales_w_ts, mt = function(x) (x)^(1/1.05), display = T)
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -34605136 -10571355 2199183 -413132 8482721 26917353
##
## Coefficients:
## Estimate Std.Error Lower Upper p-value
## K 9.122958e+07 1.292809e+07 6.589099e+07 1.165682e+08 5.66e-11 ***
## ps 6.642281e-03 1.054715e-03 4.575079e-03 8.709484e-03 3.09e-09 ***
## qs -1.898405e-02 2.042294e-04 -1.938433e-02 -1.858377e-02 7.61e-136 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error 14257525 on 152 degrees of freedom
## Multiple R-squared: 0.99983 Residual sum of squares: 3.089811e+16
# predictions
# In-sample cumulative predictions from the base weekly GGM, aligned to the series.
pred_ggm_w <- predict(ggm1_w, newx = c(1:length(sales_w_ts)))
pred_ggm_w <- ts(pred_ggm_w, start = start(sales_w_ts), frequency = frequency(sales_w_ts))
# Convert cumulative predictions to instantaneous (per-week) sales.
pred.inst_ggm_w <- make.instantaneous(pred_ggm_w)
pred.inst_ggm_w <- ts(pred.inst_ggm_w, start = start(sales_w_ts), frequency = frequency(sales_w_ts))
# Plot actual vs fitted sales for weekly data
plot(sales_w_ts, type = "p", col = "black", pch = 16, cex = 0.7,
xlab = "Week", ylab = "Weekly Sales", main = "Actual vs Fitted Sales (GGM Model)")
# Add the fitted values as a line
lines(pred.inst_ggm_w, col = "red", lwd = 2)
# Add a legend
legend("topleft", legend = c("Actual Values", "Fitted Values (GGM)"),
col = c("black", "red"), pch = c(16, NA), lty = c(NA, 1), lwd = c(NA, 2))
# Analysis of residuals
res_GGM_w <- sales_w_ts - pred.inst_ggm_w
tsdisplay(res_GGM_w)
# Check for stationarity of residuals
# ADF test: H0 = non-stationary; p-value < alpha => stationary.
adf_test_w <- adf.test(res_GGM_w)
## Warning in adf.test(res_GGM_w): p-value smaller than printed p-value
print(adf_test_w) # if p-value < alpha, series is stationary
##
## Augmented Dickey-Fuller Test
##
## data: res_GGM_w
## Dickey-Fuller = -4.3175, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
# Check for autocorrelation in residuals
# Ljung-Box: H0 = independence; here p < alpha, so residuals ARE autocorrelated.
box_test_w <- Box.test(res_GGM_w, lag = 10, type = "Ljung-Box")
print(box_test_w) # if p-value > alpha, residuals are independent
##
## Box-Ljung test
##
## data: res_GGM_w
## X-squared = 55.579, df = 10, p-value = 2.462e-08
Series is stationary according to tests, but clearly has strong autocorrelation
# RMSE
rmse_ggm_w <- calculate_rmse(observed = sales_w_ts, predicted = pred.inst_ggm_w)
# Print the RMSE
cat("RMSE for Weekly GGM Model Predictions:", rmse_ggm_w, "\n")
## RMSE for Weekly GGM Model Predictions: 3453199
# Daily GGM
# Scaling the sales data
# Min-max scale the daily series to [0, 1]; needed for GGM convergence (see note below).
sales_min <- min(sales_d_ts)
sales_max <- max(sales_d_ts)
sales_scaled <- (sales_d_ts - sales_min) / (sales_max - sales_min)
# View scaled data
summary(sales_scaled)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.1904 0.2867 0.3217 0.4343 1.0000
plot(sales_scaled, type = "l", main = "Scaled Daily Sales", xlab = "Day", ylab = "Scaled Sales")
We re-scale the data because else the model won’t converge
# Fit GGM models using scaled data
# Same four market-potential specifications as for the monthly/weekly series.
ggm1_d <- GGM(sales_scaled, mt = 'base', display = T)
## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced
## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced
## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced
## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced
ggm2_d <- GGM(sales_scaled, mt = function(x) pchisq(x, 10), display = T)
ggm3_d <- GGM(sales_scaled, mt = function(x) log(x), display = T)
ggm4_d <- GGM(sales_scaled, mt = function(x) (x)^(1/1.05), display = T)
# Summarize models
# Compare the four daily GGM specifications on fit and parameter significance.
summary(ggm1_d) # Base model
## Call: ( Guseo Guidolin Model )
##
## GGM(series = sales_scaled, mt = "base", display = T)
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.40144 -0.89220 0.18363 0.04369 0.98854 2.48411
##
## Coefficients:
## Estimate Std.Error Lower Upper p-value
## K 1.174310e+03 4.487889e+02 294.699957932 2.053920e+03 9.01e-03 **
## pc 2.384956e-05 1.641538e-05 -0.000008324 5.602311e-05 1.47e-01
## qc 2.113894e-03 1.971344e-04 0.001727518 2.500270e-03 1.64e-25 ***
## ps 1.722612e-03 5.791737e-05 0.001609096 1.836128e-03 5.37e-141 ***
## qs 2.948140e-03 1.218250e-04 0.002709367 3.186912e-03 6.84e-103 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error 1.202014 on 1035 degrees of freedom
## Multiple R-squared: 0.999862 Residual sum of squares: 1495.407
summary(ggm2_d) # Chi-squared
## Call: ( Guseo Guidolin Model )
##
## GGM(series = sales_scaled, mt = function(x) pchisq(x, 10), display = T)
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -7.8650 -2.9043 -0.9269 -0.6900 1.6614 7.6964
##
## Coefficients:
## Estimate Std.Error Lower Upper p-value
## K 4.487566e+02 3.078722e+00 4.427224e+02 4.547908e+02 0 ***
## ps 3.257391e-04 1.314285e-06 3.231632e-04 3.283151e-04 0 ***
## qs 2.848319e-03 2.756036e-05 2.794302e-03 2.902336e-03 0 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error 3.369694 on 1037 degrees of freedom
## Multiple R-squared: 0.998917 Residual sum of squares: 11774.97
summary(ggm3_d) # Log transformation
## Call: ( Guseo Guidolin Model )
##
## GGM(series = sales_scaled, mt = function(x) log(x), display = T)
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -6.0614 -2.1300 -0.4001 -0.4470 1.0521 5.8832
##
## Coefficients:
## Estimate Std.Error Lower Upper p-value
## K 6.700548e+01 4.568457e-01 66.110077198 6.790088e+01 0 ***
## ps 4.296123e-04 1.568576e-06 0.000426538 4.326867e-04 0 ***
## qs 2.228507e-03 2.530593e-05 0.002178908 2.278106e-03 0 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error 2.489536 on 1037 degrees of freedom
## Multiple R-squared: 0.999409 Residual sum of squares: 6427.109
summary(ggm4_d) # Power transformation
## Call: ( Guseo Guidolin Model )
##
## GGM(series = sales_scaled, mt = function(x) (x)^(1/1.05), display = T)
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.33513 -0.98706 0.18913 0.07835 0.93237 3.00796
##
## Coefficients:
## Estimate Std.Error Lower Upper p-value
## K 0.963478949 3.373281e-02 0.897363853 1.029594046 7.67e-133 ***
## ps 0.001651335 6.856616e-05 0.001516948 0.001785722 3.77e-102 ***
## qs -0.003188738 1.158087e-05 -0.003211436 -0.003166040 0.00e+00 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error 1.307058 on 1037 degrees of freedom
## Multiple R-squared: 0.999837 Residual sum of squares: 1771.61
Predict on the best model based on fit and p-values. We select model 1.
# Prediction using GGM model
# In-sample cumulative predictions on the scaled series, then instantaneous values.
pred_ggm_d <- predict(ggm1_d, newx = c(1:length(sales_scaled)))
pred_ggm_d <- ts(pred_ggm_d, start = start(sales_scaled), frequency = frequency(sales_scaled))
pred.inst_ggm_d <- make.instantaneous(pred_ggm_d)
pred.inst_ggm_d <- ts(pred.inst_ggm_d, start = start(sales_scaled), frequency = frequency(sales_scaled))
# Re-scale predictions back to the original scale
# Inverse of the min-max scaling applied before fitting.
pred_original_scale <- (pred.inst_ggm_d * (sales_max - sales_min)) + sales_min
# Plot actual vs fitted sales (original scale)
plot(sales_d_ts, type = "p", col = "black", pch = 16, cex = 0.7,
xlab = "Day", ylab = "Daily Sales", main = "Actual vs Fitted Sales (Original Scale)")
lines(pred_original_scale, col = "red", lwd = 2)
legend("topleft", legend = c("Actual Values", "Fitted Values (GGM, Original Scale)"),
col = c("black", "red"), pch = c(16, NA), lty = c(NA, 1), lwd = c(NA, 2))
# Analysis of residuals
res_GGM_d <- sales_d_ts - pred_original_scale
tsdisplay(res_GGM_d, main = "Residuals of GGM Model")
Residuals don't look stationary
# Check for stationarity of residuals
# ADF test: H0 = non-stationary; p-value < alpha => stationary.
adf_test_d <- adf.test(res_GGM_d)
## Warning in adf.test(res_GGM_d): p-value smaller than printed p-value
print(adf_test_d) # If p-value < alpha, series is stationary
##
## Augmented Dickey-Fuller Test
##
## data: res_GGM_d
## Dickey-Fuller = -7.8042, Lag order = 10, p-value = 0.01
## alternative hypothesis: stationary
# according to this, they are stationary
# Check for autocorrelation in residuals
# Ljung-Box: p < alpha here, so residuals are strongly autocorrelated.
box_test_d <- Box.test(res_GGM_d, lag = 10, type = "Ljung-Box")
print(box_test_d) # If p-value > alpha, residuals are indep
##
## Box-Ljung test
##
## data: res_GGM_d
## X-squared = 1403.5, df = 10, p-value < 2.2e-16
Residuals look stationary in the test but they have serial correlation
# Calculate RMSE for GGM model predictions (original scale)
rmse_ggm_d <- calculate_rmse(observed = sales_d_ts, predicted = pred_original_scale)
# Print the RMSE
cat("RMSE for Daily GGM Model Predictions (Original Scale):", rmse_ggm_d, "\n")
## RMSE for Daily GGM Model Predictions (Original Scale): 1600510
# adjust timeseries to ensure date consistency
# Re-create sales_m_ts as a monthly ts starting November 2021 (overwrites the original object).
sales_m_ts <- ts(sales_m_ts, frequency=12, start=c(2021, 11))
# Holt-Winters exponential smoothing: additive vs multiplicative seasonality.
hw1_m<- hw(sales_m_ts, seasonal="additive")
hw2_m<- hw(sales_m_ts, seasonal="multiplicative")
# prediction
fitted_hw1 <- hw1_m$fitted
fitted_hw2 <- hw2_m$fitted
We now plot the models
# Plot actual monthly sales against the additive and multiplicative
# Holt-Winters fitted values using ggplot2.
# Create a data frame for ggplot
plot_data <- data.frame(
Time = time(sales_m_ts),
Actual = as.numeric(sales_m_ts),
Fitted_Additive = as.numeric(hw1_m$fitted),
Fitted_Multiplicative = as.numeric(hw2_m$fitted)
)
# Melt data for easier ggplot usage (wide -> long format keyed by Time)
library(reshape2)
plot_data_melted <- melt(plot_data, id.vars = "Time",
variable.name = "Series",
value.name = "Value")
# Plot using ggplot2
ggplot(plot_data_melted, aes(x = Time, y = Value, color = Series)) +
geom_point(data = subset(plot_data_melted, Series == "Actual"), size = 2) + # Actual values as dots
# `linewidth` replaces the `size` aesthetic for lines (deprecated in ggplot2 3.4.0)
geom_line(data = subset(plot_data_melted, Series != "Actual"), linewidth = 1) + # Fitted values as lines
labs(
title = "Actual vs Fitted Values",
x = "Time",
y = "Value",
color = "Series"
) +
scale_color_manual(
values = c("Actual" = "black", "Fitted_Additive" = "blue", "Fitted_Multiplicative" = "red"),
labels = c("Actual", "Fitted (Additive)", "Fitted (Multiplicative)")
) +
theme_minimal() +
theme(
legend.position = "top",
legend.title = element_text(face = "bold")
)
Looks like the multiplicative model follows the data more closely in general.
# residuals
residuals_hw1 <- residuals(hw1_m)
residuals_hw2 <- residuals(hw2_m)
tsdisplay(residuals_hw1)
tsdisplay(residuals_hw2)
# Stationarity and Correlation
# check for stationarity of residuals
# additive
adf_test <- adf.test(residuals_hw1) # H0: series is non-stationary
print(adf_test) # if p-val < alpha, series is stationary
##
## Augmented Dickey-Fuller Test
##
## data: residuals_hw1
## Dickey-Fuller = -1.8495, Lag order = 3, p-value = 0.6317
## alternative hypothesis: stationary
# p-val > alpha: fail to reject H0, so the ADF test does NOT confirm stationarity here
# multiplicative
adf_test <- adf.test(residuals_hw2) # H0: series is non-stationary
print(adf_test) # if p-val < alpha, series is stationary
##
## Augmented Dickey-Fuller Test
##
## data: residuals_hw2
## Dickey-Fuller = -2.0941, Lag order = 3, p-value = 0.5365
## alternative hypothesis: stationary
# p-val > alpha: fail to reject H0, so the ADF test does NOT confirm stationarity here
# additive
# check for autocorrelation in residuals
Box.test(residuals_hw1, lag = 10, type = "Ljung-Box") # h0 res indep
##
## Box-Ljung test
##
## data: residuals_hw1
## X-squared = 4.8498, df = 10, p-value = 0.901
# p-val > alpha => Dont reject h0, so residuals are indep
# multiplicative
# check for autocorrelation in residuals
Box.test(residuals_hw2, lag = 10, type = "Ljung-Box") # h0 res indep
##
## Box-Ljung test
##
## data: residuals_hw2
## X-squared = 10.493, df = 10, p-value = 0.3983
# p-val > alpha => Dont reject h0, so residuals are indep
Multiplicative model follows the data better and has slightly better residuals
# forecast
# save the forecast of the second model
# 12-month-ahead forecasts for both Holt-Winters models.
forecast_hw1 <- forecast(hw1_m, h=12)
forecast_hw2 <- forecast(hw2_m, h=12)
# Forecast plot
# Plot the time series with both forecasts
# NOTE: layering the point-forecast `$mean` (a plain ts), so there are no
# prediction intervals to suppress — passing `PI` here triggered an
# "Ignoring unknown parameters" warning and has been removed.
autoplot(sales_m_ts) +
autolayer(forecast_hw1$mean, series="Additive Holt-Winters Forecast") +
autolayer(forecast_hw2$mean, series="Multiplicative Holt-Winters Forecast") +
ggtitle("Sales Forecast with Holt-Winters Models") +
xlab("Time") +
ylab("Sales") +
scale_color_manual(
values=c("Additive Holt-Winters Forecast" = "blue",
"Multiplicative Holt-Winters Forecast" = "red")
) +
theme_minimal() +
theme(legend.position = "top", legend.title = element_blank())
# RMSE Calculation for Holt-Winters models
# Compare in-sample fitted values from both seasonal variants.
rmse_hw1 <- calculate_rmse(observed = sales_m_ts, predicted = fitted_hw1)
rmse_hw2 <- calculate_rmse(observed = sales_m_ts, predicted = fitted_hw2)
# Print RMSE values
cat("RMSE for Additive Holt-Winters Model:", rmse_hw1, "\n")
## RMSE for Additive Holt-Winters Model: 14123520
cat("RMSE for Multiplicative Holt-Winters Model:", rmse_hw2, "\n")
## RMSE for Multiplicative Holt-Winters Model: 13169921
Multiplicative model is better
The Holt-Winters model has a frequency limit of 24, so we cannot go larger than that. Weekly and daily data have frequencies of 52 and 365 respectively, so we cannot fit the model with the R implementation so far.
ARIMA is a acronym for Auto Regressive Integrated Moving Average, ARIMA (p,d,q) where p refers to the AR part, q refers to the MA part and d is the degree of first difference involved.
First we need to check if the series is stationary
# see if series is stationary
# ADF test on the raw monthly series: H0 = non-stationary.
adf.test(sales_m_ts) #H0, series is non-stationary
##
## Augmented Dickey-Fuller Test
##
## data: sales_m_ts
## Dickey-Fuller = -2.655, Lag order = 3, p-value = 0.3183
## alternative hypothesis: stationary
# p-val > 0.05 => dont reject, non stationary: series is not stationary
# Re-test after one difference.
adf.test(diff(sales_m_ts)) #H0, series is non-stationary
##
## Augmented Dickey-Fuller Test
##
## data: diff(sales_m_ts)
## Dickey-Fuller = -3.3757, Lag order = 3, p-value = 0.07725
## alternative hypothesis: stationary
# see the acf and pacf
tsdisplay(diff(sales_m_ts))
plot(sales_m_ts)
# ndiffs() estimates how many differences are needed for stationarity.
ndiffs(sales_m_ts)
## [1] 1
tsdisplay(diff(sales_m_ts))
Correlogram plot maybe suggests AR-1 or MA-1 after first difference
# ARIMA(p,d,q) = (1,1,0)
# Manual candidate suggested by the correlograms of the differenced series.
arima1_m<- Arima(sales_m_ts, order=c(1,1,0))
summary(arima1_m)
## Series: sales_m_ts
## ARIMA(1,1,0)
##
## Coefficients:
## ar1
## -0.3178
## s.e. 0.1783
##
## sigma^2 = 2.666e+14: log likelihood = -630.5
## AIC=1265 AICc=1265.38 BIC=1268.11
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 4664178 15867282 12399192 6.652001 14.52265 0.3786368 -0.07165611
# study residual to see if is a good model
resid1_m<- residuals(arima1_m)
tsdisplay(resid1_m)
Residuals look stationary after fitting ARIMA
# Automatic order selection via AICc; chooses ARIMA(0,1,1) with drift.
auto_arima_m <- auto.arima(sales_m_ts)
auto_arima_m
## Series: sales_m_ts
## ARIMA(0,1,1) with drift
##
## Coefficients:
## ma1 drift
## -0.3741 3475637
## s.e. 0.1604 1659940
##
## sigma^2 = 2.494e+14: log likelihood = -628.83
## AIC=1263.66 AICc=1264.44 BIC=1268.33
autoplot(forecast(auto_arima_m))
# checkresiduals() plots residual diagnostics and runs a Ljung-Box test.
checkresiduals(auto_arima_m)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1) with drift
## Q* = 0.42136, df = 6, p-value = 0.9987
##
## Model df: 1. Total lags used: 7
The residuals of the Autoarima look stationary
AIC of the manual ARIMA is 1265, while that of the auto-ARIMA is 1263. Let's use the auto-ARIMA.
# Plot actual monthly sales against the fitted values of both ARIMA candidates.
# Fitted values from both models
fitted_auto_arima <- fitted(auto_arima_m)
fitted_arima1 <- fitted(arima1_m)
# Create a data frame for plotting
plot_data <- data.frame(
Time = time(sales_m_ts),
Actual = as.numeric(sales_m_ts),
Fitted_Auto_ARIMA = as.numeric(fitted_auto_arima),
Fitted_ARIMA1 = as.numeric(fitted_arima1)
)
# Melt the data frame (wide -> long format keyed by Time)
plot_data_melted <- melt(plot_data, id.vars = "Time",
variable.name = "Series",
value.name = "Value")
# Plot
ggplot(plot_data_melted, aes(x = Time, y = Value, color = Series)) +
geom_point(data = subset(plot_data_melted, Series == "Actual"), size = 2) + # Actual values as points
# `linewidth` replaces the `size` aesthetic for lines (deprecated in ggplot2 3.4.0)
geom_line(data = subset(plot_data_melted, Series != "Actual"), linewidth = 1) + # Fitted values as lines
labs(
title = "Actual vs Fitted Values for ARIMA Models",
x = "Time",
y = "Sales",
color = "Series"
) +
scale_color_manual(
values = c("Actual" = "black", "Fitted_Auto_ARIMA" = "blue", "Fitted_ARIMA1" = "red"),
labels = c("Actual", "Fitted (Auto ARIMA)", "Fitted (ARIMA(1,1,0))")
) +
theme_minimal() +
theme(
legend.position = "top",
legend.title = element_blank()
)
# Calculate RMSE for the fitted values
# Calculate RMSE for each model
rmse_auto_arima <- calculate_rmse(observed = sales_m_ts, predicted = fitted_auto_arima)
rmse_arima1 <- calculate_rmse(observed = sales_m_ts, predicted = fitted_arima1)
# Print RMSE values
cat("RMSE for Auto ARIMA Model:", rmse_auto_arima, "\n")
## RMSE for Auto ARIMA Model: 15118942
cat("RMSE for ARIMA(1,1,0) Model:", rmse_arima1, "\n")
## RMSE for ARIMA(1,1,0) Model: 15867282
The RMSE of the Autoarima is better as is the AIC.
The ARIMA(0,1,1) model can be described simply as a random walk with drift. Here’s what that means:
AR (AutoRegressive) Part:
I (Integrated) Part:
MA (Moving Average) Part:
An ARIMA(0,1,1) model is suitable when:
The d=1 parameter in ARIMA(0,1,1) indicates that the series is differenced once to achieve stationarity. Before differencing, the series may exhibit a linear trend or random walk behavior. After differencing, the series should show no trend and have relatively stable mean and variance
The q=1 in ARIMA(0,1,1) indicates that the series is modeled with a first-order moving average component after differencing. The autocorrelation function (ACF) of the differenced series should show: A significant spike at lag 1. Rapid decay to zero after lag 1. The partial autocorrelation function (PACF) should show no significant lags.
# study residual to see if is a good model
resid_autoarima_m<- residuals(auto_arima_m)
tsdisplay(resid_autoarima_m)
# see if series is stationary
# ADF test on the raw weekly series: H0 = non-stationary.
adf.test(sales_w_ts) #H0, series is non-stationary
##
## Augmented Dickey-Fuller Test
##
## data: sales_w_ts
## Dickey-Fuller = -2.7572, Lag order = 5, p-value = 0.2607
## alternative hypothesis: stationary
# p-val > 0.05 => dont reject, non stationary: series is not stationary
adf.test(diff(sales_w_ts)) # after diff is sationary
## Warning in adf.test(diff(sales_w_ts)): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: diff(sales_w_ts)
## Dickey-Fuller = -6.5801, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
After differencing, looks stationary
tsdisplay(diff(sales_w_ts))
Correlograms suggest maybe AR 1 or MA 1.
### Manual ARIMA------------
# ARIMA(p,d,q) = (1,1,0)
# Manual candidate from the correlograms of the differenced weekly series.
arima1_w<- Arima(sales_w_ts, order=c(1,1,0))
summary(arima1_w)
## Series: sales_w_ts
## ARIMA(1,1,0)
##
## Coefficients:
## ar1
## -0.4393
## s.e. 0.0742
##
## sigma^2 = 1.161e+13: log likelihood = -2534.49
## AIC=5072.98 AICc=5073.06 BIC=5079.06
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 193180.2 3385012 2537492 -0.1816435 13.03938 0.3404022 -0.04968945
# Automatic order selection for comparison; picks ARIMA(0,1,1).
auto_arima_w <- auto.arima(sales_w_ts)
summary(auto_arima_w)
## Series: sales_w_ts
## ARIMA(0,1,1)
##
## Coefficients:
## ma1
## -0.5426
## s.e. 0.0758
##
## sigma^2 = 1.122e+13: log likelihood = -2531.96
## AIC=5067.91 AICc=5067.99 BIC=5073.99
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 335675.7 3328293 2462721 0.6620165 12.38346 0.3303717 -0.01238358
AIC on the Autoarima is better, lets go with that one
# Residual diagnostics for the selected weekly model (plots + Ljung-Box test).
checkresiduals(auto_arima_w)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1)
## Q* = 17.171, df = 30, p-value = 0.9705
##
## Model df: 1. Total lags used: 31
Residuals look stationary, see the plots for both models
# Fit ARIMA models for weekly data
# NOTE(review): these refits duplicate the identical fits above (arima1_w,
# auto_arima_w); kept for chunk self-containment in the rendered report.
arima1_w <- Arima(sales_w_ts, order = c(1, 1, 0))
auto_arima_w <- auto.arima(sales_w_ts)
# Extract fitted values for both models
fitted_arima1_w <- fitted(arima1_w)
fitted_auto_arima_w <- fitted(auto_arima_w)
# Create a data frame for plotting
plot_data <- data.frame(
Time = time(sales_w_ts),
Actual = as.numeric(sales_w_ts),
Fitted_ARIMA1 = as.numeric(fitted_arima1_w),
Fitted_Auto_ARIMA = as.numeric(fitted_auto_arima_w)
)
# Melt the data frame for ggplot2 (wide -> long format keyed by Time)
plot_data_melted <- melt(plot_data, id.vars = "Time",
variable.name = "Series",
value.name = "Value")
# Plot using ggplot2
ggplot(plot_data_melted, aes(x = Time, y = Value, color = Series)) +
geom_point(data = subset(plot_data_melted, Series == "Actual"), size = 2) + # Actual values as points
# `linewidth` replaces the `size` aesthetic for lines (deprecated in ggplot2 3.4.0)
geom_line(data = subset(plot_data_melted, Series != "Actual"), linewidth = 1) + # Fitted values as lines
labs(
title = "Actual vs Fitted Values for ARIMA Models (Weekly)",
x = "Time",
y = "Sales",
color = "Series"
) +
scale_color_manual(
values = c("Actual" = "black", "Fitted_ARIMA1" = "red", "Fitted_Auto_ARIMA" = "blue"),
labels = c("Actual", "Fitted (ARIMA(1,1,0))", "Fitted (Auto ARIMA)")
) +
theme_minimal() +
theme(
legend.position = "top",
legend.title = element_blank()
)
# Calculate RMSE for both models
rmse_arima1_w <- calculate_rmse(observed = sales_w_ts, predicted = fitted_arima1_w)
rmse_auto_arima_w <- calculate_rmse(observed = sales_w_ts, predicted = fitted_auto_arima_w)
# Print RMSE values
cat("RMSE for ARIMA(1,1,0) Model (Weekly):", rmse_arima1_w, "\n")
## RMSE for ARIMA(1,1,0) Model (Weekly): 3385012
cat("RMSE for Auto ARIMA Model (Weekly):", rmse_auto_arima_w, "\n")
## RMSE for Auto ARIMA Model (Weekly): 3328293
The Auto-arima is also better in terms of RMSE
# see if series is stationary
# ADF test on the raw daily series: H0 = non-stationary.
adf.test(sales_d_ts) #H0, series is non-stationary
## Warning in adf.test(sales_d_ts): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: sales_d_ts
## Dickey-Fuller = -5.6283, Lag order = 10, p-value = 0.01
## alternative hypothesis: stationary
# p-val < 0.05 => reject non stationary: series might be stationary
No need for differencing because the series is already stationary; try to model it with ARIMA.
tsdisplay(sales_d_ts)
Autocorrelograms are not easy to interpret, but let's try with a baseline model.
# Baseline model: ARIMA(p,d,q) = (1,0,1) with non-zero mean
# (no differencing, since the ADF test above suggested stationarity)
arima1_d<- Arima(sales_d_ts, order=c(1,0,1))
summary(arima1_d)
## Series: sales_d_ts
## ARIMA(1,0,1) with non-zero mean
##
## Coefficients:
## ar1 ma1 mean
## 0.3253 0.3666 3320965.47
## s.e. 0.0434 0.0400 92676.64
##
## sigma^2 = 2.184e+12: log likelihood = -16248.67
## AIC=32505.35 AICc=32505.39 BIC=32525.14
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 545.5599 1475571 1180971 -110.111 132.9826 0.5165637 0.0150054
# Residual diagnostics: Ljung-Box strongly rejects independence here
checkresiduals(arima1_d)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,1) with non-zero mean
## Q* = 2743.8, df = 206, p-value < 2.2e-16
##
## Model df: 2. Total lags used: 208
Residuals do not look like white noise (the Ljung-Box test strongly rejects independence).
Try to model with an automatic approach:
# Automatic model selection on the daily series (auto.arima chooses p,d,q by AICc)
auto_arima_d <- auto.arima(sales_d_ts)
summary(auto_arima_d)
## Series: sales_d_ts
## ARIMA(5,1,5) with drift
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ma1 ma2 ma3
## 0.9544 -1.5459 1.0021 -1.0850 0.1718 -1.6378 1.8605 -1.7958
## s.e. 0.0587 0.0493 0.0822 0.0449 0.0505 0.0495 0.0774 0.0845
## ma4 ma5 drift
## 1.3246 -0.6504 3652.866
## s.e. 0.0733 0.0363 2319.631
##
## sigma^2 = 1.213e+12: log likelihood = -15926.04
## AIC=31876.08 AICc=31876.38 BIC=31935.43
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -1608.767 1094980 810378.5 -49.07305 67.93344 0.3544643
## ACF1
## Training set 0.003023956
# Residuals now pass the Ljung-Box test (p = 0.66): no remaining autocorrelation
checkresiduals(auto_arima_d)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,1,5) with drift
## Q* = 189.3, df = 198, p-value = 0.6592
##
## Model df: 10. Total lags used: 208
Residuals improve and the AIC is lower for the auto-ARIMA model. Check the fit for both models.
# ---- Daily ARIMA models: compare in-sample fit of manual vs auto model ----
# Extract fitted values for both models
fitted_arima1_d <- fitted(arima1_d)
fitted_auto_arima_d <- fitted(auto_arima_d)
# Create a data frame holding actuals and both fitted series for plotting
plot_data <- data.frame(
  Time = time(sales_d_ts),
  Actual = as.numeric(sales_d_ts),
  Fitted_ARIMA1 = as.numeric(fitted_arima1_d),
  Fitted_Auto_ARIMA = as.numeric(fitted_auto_arima_d)
)
# Melt the data frame into long format for ggplot2
plot_data_melted <- melt(plot_data, id.vars = "Time",
                         variable.name = "Series",
                         value.name = "Value")
# Plot using ggplot2.
# NOTE: since ggplot2 3.4.0 the line-thickness aesthetic is `linewidth`;
# `size` on geom_line is deprecated (it still draws, but warns).
ggplot(plot_data_melted, aes(x = Time, y = Value, color = Series)) +
  geom_point(data = subset(plot_data_melted, Series == "Actual"), size = 2) + # Actual values as points
  geom_line(data = subset(plot_data_melted, Series != "Actual"), linewidth = 1) + # Fitted values as lines
  labs(
    title = "Actual vs Fitted Values for ARIMA Models (Daily)",
    x = "Time",
    y = "Sales",
    color = "Series"
  ) +
  scale_color_manual(
    values = c("Actual" = "black", "Fitted_ARIMA1" = "red", "Fitted_Auto_ARIMA" = "blue"),
    labels = c("Actual", "Fitted (ARIMA(1,0,1))", "Fitted (Auto ARIMA)")
  ) +
  theme_minimal() +
  theme(
    legend.position = "top",
    legend.title = element_blank()
  )
The plot is not readable, but check the RMSE for both models to confirm which fits better.
# Calculate in-sample RMSE for both daily models
rmse_arima1_d <- calculate_rmse(observed = sales_d_ts, predicted = fitted_arima1_d)
rmse_auto_arima_d <- calculate_rmse(observed = sales_d_ts, predicted = fitted_auto_arima_d)
# Print RMSE values
cat("RMSE for ARIMA(1,0,1) Model (Daily):", rmse_arima1_d, "\n")
## RMSE for ARIMA(1,0,1) Model (Daily): 1475571
cat("RMSE for Auto ARIMA Model (Daily):", rmse_auto_arima_d, "\n")
## RMSE for Auto ARIMA Model (Daily): 1094980
Auto-ARIMA is much better; now try to improve with seasonality, because the daily data looks seasonal every 7 days.
# Daily sales: attempt a seasonal specification
tsdisplay(sales_d_ts) #
tsdisplay(diff(sales_d_ts))
# NOTE(review): sales_d_ts has frequency 365; auto.arima does not fit seasonal
# terms for such a long seasonal period, so seasonal=TRUE is expected to return
# the same non-seasonal model as before — confirm by comparing the summaries.
sarima_d <- auto.arima(sales_d_ts, seasonal=TRUE)
summary(sarima_d)
## Series: sales_d_ts
## ARIMA(5,1,5) with drift
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ma1 ma2 ma3
## 0.9544 -1.5459 1.0021 -1.0850 0.1718 -1.6378 1.8605 -1.7958
## s.e. 0.0587 0.0493 0.0822 0.0449 0.0505 0.0495 0.0774 0.0845
## ma4 ma5 drift
## 1.3246 -0.6504 3652.866
## s.e. 0.0733 0.0363 2319.631
##
## sigma^2 = 1.213e+12: log likelihood = -15926.04
## AIC=31876.08 AICc=31876.38 BIC=31935.43
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -1608.767 1094980 810378.5 -49.07305 67.93344 0.3544643
## ACF1
## Training set 0.003023956
resid_ds<- residuals(sarima_d)
tsdisplay(resid_ds)
# check for autocorrelation
Box.test(residuals(sarima_d), type="Ljung-Box")
##
## Box-Ljung test
##
## data: residuals(sarima_d)
## X-squared = 0.0095375, df = 1, p-value = 0.9222
# A low p-value (<0.05) suggests residual autocorrelation.
Looks like the seasonal auto-ARIMA is identical to the non-seasonal one in terms of AIC; let's check the RMSE:
# Extract fitted values for the seasonal model
fitted_sarima_d <- fitted(sarima_d)
# Calculate RMSE and compare against the plain auto.arima fit
rmse_sarima_d <- calculate_rmse(observed = sales_d_ts, predicted = fitted_sarima_d)
# Print RMSE values
cat("RMSE for Auto ARIMA Model (Daily):", rmse_auto_arima_d, "\n")
## RMSE for Auto ARIMA Model (Daily): 1094980
cat("RMSE for Seasonal ARIMA Model (Daily):", rmse_sarima_d, "\n")
## RMSE for Seasonal ARIMA Model (Daily): 1094980
The RMSE is exactly the same, they are the same model.
Refine SARIMA with external regressors
# Redefine sales_d_ts on the original (back-transformed) scale
head(df_merged_d)
sales_d_ts <- ts(exp(df_merged_d$sales_cop), frequency=365, start=c(2021, 334)) # 334 is November 30
seasonal_sales_d_ts <- ts(exp(df_merged_d$sales_cop), frequency=7, start=c(2021, 334)) # 334 is November 30
plot(sales_d_ts)
tsdisplay(sales_d_ts,lag.max = 30)
tsdisplay(seasonal_sales_d_ts,lag.max = 30)
# Define regressors: select specific columns by name
x_regressors_d <- df_merged_d %>% select(rain_sum, fx, tmedian)
# Back-transform and convert to a matrix for ARIMA modeling.
# exp() is vectorized, so exp(as.matrix(...)) is equivalent to the former
# apply(..., 2, exp) round-trip through a data frame, without the coercion.
x_regressors_d <- exp(as.matrix(x_regressors_d))
# Fit the model on sales:
# auto.arima with seasonal component and external regressors (SARIMAX)
sarimax_model_d <- auto.arima(
  sales_d_ts,
  seasonal = TRUE, # Enable seasonal components
  xreg = x_regressors_d # External regressors
)
# Display the summary of the fitted model
summary(sarimax_model_d)
## Series: sales_d_ts
## Regression with ARIMA(5,1,5) errors
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ma1 ma2 ma3
## 0.9230 -1.5215 0.9550 -1.0569 0.1394 -1.6554 1.8823 -1.8168
## s.e. 0.0605 0.0502 0.0831 0.0448 0.0509 0.0516 0.0805 0.0836
## ma4 ma5 drift rain_sum fx tmedian
## 1.3447 -0.6535 3880.584 38.8494 -375.8824 62715.8
## s.e. 0.0730 0.0373 2346.036 112.5507 383.7332 30430.2
##
## sigma^2 = 1.333e+12: log likelihood = -15973.62
## AIC=31977.25 AICc=31977.72 BIC=32051.44
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -2264.148 1146332 833227 -3414423 3414442 0.3598772 0.000552024
The AIC actually increases (31977 vs 31876 without regressors); let's check the RMSE anyway.
# Extract fitted values for the SARIMAX model
fitted_sarimax_d <- fitted(sarimax_model_d)
# Calculate RMSE and compare all daily models
rmse_sarimax_d <- calculate_rmse(observed = sales_d_ts, predicted = fitted_sarimax_d)
# Print RMSE values
cat("RMSE for Auto ARIMA Model (Daily):", rmse_auto_arima_d, "\n")
## RMSE for Auto ARIMA Model (Daily): 1094980
cat("RMSE for Seasonal ARIMA Model (Daily):", rmse_sarima_d, "\n")
## RMSE for Seasonal ARIMA Model (Daily): 1094980
cat("RMSE for SARIMAX Model (Daily):", rmse_sarimax_d, "\n")
## RMSE for SARIMAX Model (Daily): 1146332
The RMSE also worsens, so stay with regular Auto-ARIMA
Monthly Sales
# Simple exponential smoothing on monthly sales:
# fixed alpha = 0.2, fixed alpha = 0.6, and alpha optimized by ses()
fit_m1 <- ses(sales_m_ts, alpha = 0.2, initial = 'simple', h=5)
fit_m2 <- ses(sales_m_ts, alpha = 0.6, initial = 'simple', h=5)
fit_m3 <- ses(sales_m_ts, h=5)
plot(sales_m_ts, ylab='Monthly Sales', xlab='Months')
lines(fitted(fit_m1), col='blue', type='o')
lines(fitted(fit_m2), col='red', type='o')
lines(fitted(fit_m3), col='green', type='o')
forecast_m1 <- ses(sales_m_ts, h=5)
# Accuracy of one-step-ahead training errors
round(accuracy(forecast_m1),2)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 3207582 17626876 13250973 -12.1 29.7 0.4 -0.05
summary(forecast_m1)
##
## Forecast method: Simple exponential smoothing
##
## Model Information:
## Simple exponential smoothing
##
## Call:
## ses(y = sales_m_ts, h = 5)
##
## Smoothing parameters:
## alpha = 0.675
##
## Initial states:
## l = 56978123.3871
##
## sigma: 18137906
##
## AIC AICc BIC
## 1336.322 1337.072 1341.073
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 3207582 17626876 13250973 -12.09656 29.70395 0.4046478 -0.0504105
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Nov 2024 134921433 111676771 158166094 99371791 170471075
## Dec 2024 134921433 106877020 162965845 92031205 177811660
## Jan 2025 134921433 102786345 167056520 85775057 184067808
## Feb 2025 134921433 99160579 170682286 80229927 189612939
## Mar 2025 134921433 95870012 173972853 75197439 194645426
autoplot(forecast_m1) + autolayer(fitted(forecast_m1),series='Fitted') + ylab("Monthly Sales")+xlab("Months")
# Extract fitted values for each model
fitted_m1 <- fitted(fit_m1)
fitted_m2 <- fitted(fit_m2)
fitted_m3 <- fitted(fit_m3)
# Calculate RMSE for each model
rmse_m1 <- calculate_rmse(observed = sales_m_ts, predicted = fitted_m1)
rmse_m2 <- calculate_rmse(observed = sales_m_ts, predicted = fitted_m2)
rmse_m3 <- calculate_rmse(observed = sales_m_ts, predicted = fitted_m3)
# Print RMSE values
cat("RMSE for SES Model 1 (alpha = 0.2):", rmse_m1, "\n")
## RMSE for SES Model 1 (alpha = 0.2): 23924620
cat("RMSE for SES Model 2 (alpha = 0.6):", rmse_m2, "\n")
## RMSE for SES Model 2 (alpha = 0.6): 16189623
cat("RMSE for SES Model 3 (Optimized alpha):", rmse_m3, "\n")
## RMSE for SES Model 3 (Optimized alpha): 17626876
# Keep the lowest-RMSE SES model (alpha = 0.6) as the monthly benchmark
rmse_exp_sm_m <- rmse_m2
Weekly Sales
For weekly data, exponential smoothing can capture longer-term trends and seasonal patterns that repeat on a weekly basis. Weekly data can also have seasonal components related to months, quarters, or years.
# Simple exponential smoothing on weekly sales:
# fixed alpha = 0.2, fixed alpha = 0.6, and alpha optimized by ses()
fit_w1 <- ses(sales_w_ts, alpha = 0.2, initial = 'simple', h=5)
fit_w2 <- ses(sales_w_ts, alpha = 0.6, initial = 'simple', h=5)
fit_w3 <- ses(sales_w_ts, h=5)
plot(sales_w_ts, ylab='Weekly Sales', xlab='Weeks')
lines(fitted(fit_w1), col='blue', type='o')
lines(fitted(fit_w2), col='red', type='o')
lines(fitted(fit_w3), col='green', type='o')
forecast_w1 <- ses(sales_w_ts, h=5)
# Accuracy of one-step-ahead training errors
round(accuracy(forecast_w1),2)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 306073.8 3331847 2478469 -0.46 13.08 0.33 -0.01
summary(forecast_w1)
##
## Forecast method: Simple exponential smoothing
##
## Model Information:
## Simple exponential smoothing
##
## Call:
## ses(y = sales_w_ts, h = 5)
##
## Smoothing parameters:
## alpha = 0.4502
##
## Initial states:
## l = 5339607.7196
##
## sigma: 3353553
##
## AIC AICc BIC
## 5443.633 5443.791 5452.763
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 306073.8 3331847 2478469 -0.4636447 13.08087 0.3324843
## ACF1
## Training set -0.006458446
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2024.865 26697597 22399845 30995348 20124753 33270441
## 2024.885 26697597 21984397 31410797 19489379 33905814
## 2024.904 26697597 21602713 31792480 18905645 34489549
## 2024.923 26697597 21247696 32147498 18362692 35032501
## 2024.942 26697597 20914431 32480762 17853008 35542186
autoplot(forecast_w1) + autolayer(fitted(forecast_w1),series='Fitted') + ylab("Weekly Sales")+xlab("Weeks")
# Extract fitted values for each model
fitted_w1 <- fitted(fit_w1)
fitted_w2 <- fitted(fit_w2)
fitted_w3 <- fitted(fit_w3)
# Calculate RMSE for each model
rmse_w1 <- calculate_rmse(observed = sales_w_ts, predicted = fitted_w1)
rmse_w2 <- calculate_rmse(observed = sales_w_ts, predicted = fitted_w2)
rmse_w3 <- calculate_rmse(observed = sales_w_ts, predicted = fitted_w3)
# Print RMSE values
cat("RMSE for SES Model 1 (alpha = 0.2):", rmse_w1, "\n")
## RMSE for SES Model 1 (alpha = 0.2): 3564863
cat("RMSE for SES Model 2 (alpha = 0.6):", rmse_w2, "\n")
## RMSE for SES Model 2 (alpha = 0.6): 3363852
cat("RMSE for SES Model 3 (Optimized alpha):", rmse_w3, "\n")
## RMSE for SES Model 3 (Optimized alpha): 3331847
# Keep the lowest-RMSE SES model (optimized alpha) as the weekly benchmark
rmse_exp_sm_w <- rmse_w3
Daily Sales
For daily data, exponential smoothing can be used to forecast short-term trends and seasonal patterns. When applying exponential smoothing to daily data, you need to consider:
Seasonality: Daily data often exhibit seasonal patterns, such as weekly cycles (e.g., higher sales on weekends).
Holidays and special events: These can cause irregular patterns in daily data that may need to be accounted for.
# Simple exponential smoothing on daily sales:
# fixed alpha = 0.2, fixed alpha = 0.6, and alpha optimized by ses()
fit_d1 <- ses(sales_d_ts, alpha = 0.2, initial = 'simple', h=5)
fit_d2 <- ses(sales_d_ts, alpha = 0.6, initial = 'simple', h=5)
fit_d3 <- ses(sales_d_ts, h=5)
plot(sales_d_ts, ylab='Daily Sales', xlab='Days')
lines(fitted(fit_d1), col='blue', type='o')
lines(fitted(fit_d2), col='red', type='o')
lines(fitted(fit_d3), col='green', type='o')
forecast_d1 <- ses(sales_d_ts, h=5)
# Accuracy of one-step-ahead training errors
round(accuracy(forecast_d1),2)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 91166.44 1642520 1279475 -3521871 3521898 0.55 0.38
summary(forecast_d1)
##
## Forecast method: Simple exponential smoothing
##
## Model Information:
## Simple exponential smoothing
##
## Call:
## ses(y = sales_d_ts, h = 5)
##
## Smoothing parameters:
## alpha = 0.0372
##
## Initial states:
## l = 863041.8751
##
## sigma: 1644102
##
## AIC AICc BIC
## 36999.28 36999.30 37014.12
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 91166.44 1642520 1279475 -3521871 3521898 0.5526151 0.3755302
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2024.7616 4388896 2281895 6495897 1166516 7611277
## 2024.7644 4388896 2280439 6497354 1164288 7613504
## 2024.7671 4388896 2278983 6498809 1162063 7615730
## 2024.7699 4388896 2277529 6500264 1159838 7617954
## 2024.7726 4388896 2276075 6501717 1157616 7620177
autoplot(forecast_d1) + autolayer(fitted(forecast_d1),series='Fitted') + ylab("Daily Sales")+xlab("Days")
# Extract fitted values for each model
fitted_d1 <- fitted(fit_d1)
fitted_d2 <- fitted(fit_d2)
fitted_d3 <- fitted(fit_d3)
# Calculate RMSE for each model
rmse_d1 <- calculate_rmse(observed = sales_d_ts, predicted = fitted_d1)
rmse_d2 <- calculate_rmse(observed = sales_d_ts, predicted = fitted_d2)
rmse_d3 <- calculate_rmse(observed = sales_d_ts, predicted = fitted_d3)
# Print RMSE values
cat("RMSE for SES Model 1 (alpha = 0.2):", rmse_d1, "\n")
## RMSE for SES Model 1 (alpha = 0.2): 1714231
cat("RMSE for SES Model 2 (alpha = 0.6):", rmse_d2, "\n")
## RMSE for SES Model 2 (alpha = 0.6): 1803005
cat("RMSE for SES Model 3 (Optimized alpha):", rmse_d3, "\n")
## RMSE for SES Model 3 (Optimized alpha): 1642520
# Keep the lowest-RMSE SES model (optimized alpha) as the daily benchmark
rmse_exp_sm_d <- rmse_d3
# 1. GGM on sales_m_ts (Guseo-Guidolin model; ggm1 is fitted earlier in the file)
summary(ggm1)
## Call: ( Guseo Guidolin Model )
##
## GGM(series = sales_m_ts, mt = "base", display = T)
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -27544719 -8809035 742251 -337148 7701034 23238738
##
## Coefficients:
## Estimate Std.Error Lower Upper p-value
## K 9.208740e+09 2.454321e+09 4.398360e+09 1.401912e+10 7.24e-04 ***
## pc 2.617639e-02 3.002777e-03 2.029106e-02 3.206173e-02 7.64e-10 ***
## qc 2.453426e-01 3.079890e-02 1.849779e-01 3.057074e-01 5.41e-09 ***
## ps 6.782088e-03 1.329165e-03 4.176972e-03 9.387204e-03 1.60e-05 ***
## qs 3.849713e-02 8.943356e-03 2.096848e-02 5.602579e-02 1.56e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error 13318300 on 31 degrees of freedom
## Multiple R-squared: 0.999873 Residual sum of squares: 5.498691e+15
# 2. Get the instantaneous fitted values of the GGM
# (GGM fits the cumulative curve; make.instantaneous differences it back)
fitted_ggm_cum <- fitted(ggm1)
fitted_ggm_instant <- make.instantaneous(fitted_ggm_cum)
# 3. Auto-SARIMAX on cumsum(sales_m_ts), with xreg = instantaneous GGM
# First, create the cumulative sales
cumulative_sales_m_ts <- cumsum(sales_m_ts)
sarima_m <- auto.arima(
cumulative_sales_m_ts,
xreg = fitted_ggm_instant,
seasonal = TRUE,
stepwise = FALSE,
approximation= FALSE
)
# 4. Extract the fitted (cumulative) values of the SARIMAX
fitted_cum_sarima <- fitted(sarima_m)
# 5. Convert SARIMAX fitted cumulative back to instantaneous
# One simple approach: first-difference with a leading 0, so the first
# instantaneous value equals the first cumulative fitted value
fitted_instant_sarima <- diff(c(0, fitted_cum_sarima))
# Now fitted_instant_sarima should have the same length as sales_m_ts.
# Turn it into a time series object for neat plotting
fitted_instant_sarima_ts <- ts(
fitted_instant_sarima,
start = start(sales_m_ts),
frequency = frequency(sales_m_ts)
)
# 6. Plot the actual monthly sales and the SARIMAX instantaneous fit
plot(
sales_m_ts,
type="o",
col="black",
lwd=2,
main="Actual Monthly Sales vs. SARIMAX Fitted (Instantaneous)",
xlab="Time",
ylab="Monthly Sales"
)
lines(
fitted_instant_sarima_ts,
col="red",
lty=2,
lwd=2
)
legend(
"bottomright",
legend = c("Actual Sales", "SARIMAX Fit (Instantaneous)"),
col = c("black", "red"),
lty = c(1, 2),
lwd = c(2, 2)
)
# Calculate RMSE for fitted_instant_sarima_ts against sales_m_ts
rmse_mixture_m <- calculate_rmse(observed = sales_m_ts, predicted = fitted_instant_sarima_ts)
# Print the RMSE value
cat("RMSE for Fitted Instantaneous Values (GGM + SARIMAX):", rmse_mixture_m, "\n")
## RMSE for Fitted Instantaneous Values (GGM + SARIMAX): 20987648
# Residual diagnostics of the mixture fit
resid_mixture_m <- sales_m_ts - fitted_instant_sarima_ts
tsdisplay(resid_mixture_m)
Residuals have autocorrelation at lag 1
# 1. GGM on sales_w_ts (ggm1_w is fitted earlier in the file)
summary(ggm1_w)
## Call: ( Guseo Guidolin Model )
##
## GGM(series = sales_w_ts, mt = "base", display = T)
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -30941134 -8439496 1818273 88704 8469414 21113528
##
## Coefficients:
## Estimate Std.Error Lower Upper p-value
## K 6.454454e+09 5.261291e+08 5.423260e+09 7.485648e+09 2.13e-24 ***
## pc 3.542302e-04 2.760784e-05 3.001199e-04 4.083406e-04 6.66e-26 ***
## qc 2.078354e-02 1.326387e-03 1.818387e-02 2.338321e-02 2.16e-33 ***
## ps 1.149074e-02 3.238499e-04 1.085601e-02 1.212547e-02 7.53e-75 ***
## qs 3.110864e-02 2.164583e-03 2.686613e-02 3.535114e-02 5.35e-30 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error 12271825 on 150 degrees of freedom
## Multiple R-squared: 0.999876 Residual sum of squares: 2.258965e+16
# 2. Get the instantaneous fitted values of the GGM
fitted_ggm_cum <- fitted(ggm1_w) # Cumulative fit from GGM
fitted_ggm_instant <- make.instantaneous(fitted_ggm_cum) # Convert to weekly flow
# 3. Auto-SARIMAX on cumsum(sales_w_ts), with xreg = instantaneous GGM
# Create the cumulative weekly series
cumulative_sales_w_ts <- cumsum(sales_w_ts)
# Fit an auto.arima with the GGM instantaneous series as xreg
sarima_w <- auto.arima(
cumulative_sales_w_ts,
xreg = fitted_ggm_instant,
seasonal = TRUE,
stepwise = FALSE,
approximation= FALSE
)
# 4. Extract the fitted (cumulative) values of the SARIMAX
fitted_cum_sarima <- fitted(sarima_w)
# This is the model's in-sample one-step-ahead *cumulative* fit.
# 5. Convert SARIMAX fitted cumulative back to instantaneous
# (first-difference with a leading 0, matching the length of sales_w_ts)
fitted_instant_sarima <- diff(c(0, fitted_cum_sarima))
# Turn it into a time series object
fitted_instant_sarima_ts <- ts(
fitted_instant_sarima,
start = start(sales_w_ts),
frequency = frequency(sales_w_ts)
)
# 6. Plot the actual weekly sales and the SARIMAX instantaneous fit
plot(
sales_w_ts,
type = "o",
col = "black",
lwd = 2,
main = "Actual Weekly Sales vs. SARIMAX Fitted (Instantaneous)",
xlab = "Time (Weeks)",
ylab = "Weekly Sales"
)
lines(
fitted_instant_sarima_ts,
col = "red",
lty = 2,
lwd = 2
)
legend(
"bottomright",
legend = c("Actual Weekly Sales", "SARIMAX Fit (Instantaneous)"),
col = c("black", "red"),
lty = c(1, 2),
lwd = c(2, 2)
)
# Residuals
# Step 1: Extract residuals from the SARIMA model
resid_w <- residuals(sarima_w)
# Step 2: Visualize residuals
# Time series plot of residuals
tsdisplay(resid_w, main = "Residual Diagnostics for SARIMA Model")
# Step 3: Test residuals for stationarity
adf_test <- adf.test(resid_w)
## Warning in adf.test(resid_w): p-value smaller than printed p-value
cat("ADF Test p-value:", adf_test$p.value, "\n")
## ADF Test p-value: 0.01
if (adf_test$p.value < 0.05) {
cat("The residuals are stationary.\n")
} else {
cat("The residuals are not stationary.\n")
}
## The residuals are stationary.
# Step 4: Test residuals for white noise (no autocorrelation)
ljung_box_test <- Box.test(resid_w, lag = 20, type = "Ljung-Box")
cat("Ljung-Box Test p-value:", ljung_box_test$p.value, "\n")
## Ljung-Box Test p-value: 0.8993948
if (ljung_box_test$p.value > 0.05) {
cat("The residuals resemble white noise (uncorrelated).\n")
} else {
cat("The residuals show significant autocorrelation.\n")
}
## The residuals resemble white noise (uncorrelated).
Stationary residuals with no significant autocorrelation (Ljung-Box p = 0.90, white noise).
#### RMSE for SARIMAX Predictions (weekly GGM + SARIMAX mixture) ####
rmse_mixture_w <- calculate_rmse(observed = sales_w_ts, predicted = fitted_instant_sarima_ts)
# Print RMSE for SARIMAX
cat("RMSE for SARIMAX Predictions:", rmse_mixture_w, "\n")
## RMSE for SARIMAX Predictions: 4567633
# 1. GGM on sales_d_ts (ggm1_d was fitted earlier on a scaled daily series)
summary(ggm1_d)
## Call: ( Guseo Guidolin Model )
##
## GGM(series = sales_scaled, mt = "base", display = T)
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.40144 -0.89220 0.18363 0.04369 0.98854 2.48411
##
## Coefficients:
## Estimate Std.Error Lower Upper p-value
## K 1.174310e+03 4.487889e+02 294.699957932 2.053920e+03 9.01e-03 **
## pc 2.384956e-05 1.641538e-05 -0.000008324 5.602311e-05 1.47e-01
## qc 2.113894e-03 1.971344e-04 0.001727518 2.500270e-03 1.64e-25 ***
## ps 1.722612e-03 5.791737e-05 0.001609096 1.836128e-03 5.37e-141 ***
## qs 2.948140e-03 1.218250e-04 0.002709367 3.186912e-03 6.84e-103 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error 1.202014 on 1035 degrees of freedom
## Multiple R-squared: 0.999862 Residual sum of squares: 1495.407
# 2. Get the instantaneous fitted values of the GGM
fitted_ggm_cum <- fitted(ggm1_d) # Cumulative fit from GGM
fitted_ggm_instant <- make.instantaneous(fitted_ggm_cum) # Convert to daily flow
# 3. Auto-SARIMAX on cumsum(sales_d_ts), with xreg = instantaneous GGM
# Create the cumulative daily series
cumulative_sales_d_ts <- cumsum(sales_d_ts)
# first scale data so it can converge
# NOTE(review): scale() returns a matrix, dropping the ts class — the
# scaling attributes read below are used to back-transform later
cumulative_sales_d_ts <- scale(cumulative_sales_d_ts)
fitted_ggm_instant <- scale(fitted_ggm_instant)
# Keep the scaling parameters (center/scale) to undo the transformation
cumulative_sales_center <- attr(cumulative_sales_d_ts, "scaled:center")
cumulative_sales_scale <- attr(cumulative_sales_d_ts, "scaled:scale")
ggm_instant_center <- attr(fitted_ggm_instant, "scaled:center")
ggm_instant_scale <- attr(fitted_ggm_instant, "scaled:scale")
sarima_d <- auto.arima(
cumulative_sales_d_ts,
xreg = fitted_ggm_instant,
stepwise = TRUE,
approximation = TRUE
)
# 4. Extract the fitted (cumulative) values of the SARIMAX
# Fitted cumulative values (scaled)
fitted_cum_sarima <- fitted(sarima_d)
# Scale back to original cumulative values
fitted_cum_sarima_original <- fitted_cum_sarima * cumulative_sales_scale + cumulative_sales_center
# Convert cumulative fitted to instantaneous fitted (original units)
fitted_instant_sarima_original <- diff(c(0, fitted_cum_sarima_original))
# Create time series object for instantaneous fitted values
fitted_instant_sarima_original_ts <- ts(
fitted_instant_sarima_original,
start = start(sales_d_ts),
frequency = frequency(sales_d_ts)
)
# Plot the results
plot(
sales_d_ts,
type = "o",
col = "black",
lwd = 2,
main = "Actual Daily Sales vs. SARIMAX Fitted (Instantaneous)",
xlab = "Time (Days)",
ylab = "Daily Sales"
)
lines(
fitted_instant_sarima_original_ts,
col = "red",
lty = 2,
lwd = 2
)
legend(
"bottomright",
legend = c("Actual Daily Sales", "SARIMAX Fit (Instantaneous)"),
col = c("black", "red"),
lty = c(1, 2),
lwd = c(2, 2)
)
#### Residuals-----------------------
# Extract residuals from the SARIMA model
resid_d <- residuals(sarima_d)
# Visualize residuals
tsdisplay(resid_d, main = "Residual Diagnostics for SARIMA Model")
# Test residuals for stationarity
adf_test <- adf.test(resid_d)
## Warning in adf.test(resid_d): p-value smaller than printed p-value
cat("ADF Test p-value:", adf_test$p.value, "\n")
## ADF Test p-value: 0.01
if (adf_test$p.value < 0.05) {
cat("The residuals are stationary.\n")
} else {
cat("The residuals are not stationary.\n")
}
## The residuals are stationary.
# Test residuals for white noise (no autocorrelation)
ljung_box_test <- Box.test(resid_d, lag = 20, type = "Ljung-Box")
cat("Ljung-Box Test p-value:", ljung_box_test$p.value, "\n")
## Ljung-Box Test p-value: 0.5910185
if (ljung_box_test$p.value > 0.05) {
cat("The residuals resemble white noise (uncorrelated).\n")
} else {
cat("The residuals show significant autocorrelation.\n")
}
## The residuals resemble white noise (uncorrelated).
#### RMSE for SARIMAX Predictions ####
rmse_mixture_d <- calculate_rmse(observed = sales_d_ts, predicted = fitted_instant_sarima_original_ts)
# Print RMSE for SARIMAX
cat("RMSE for SARIMAX Predictions:", rmse_mixture_d, "\n")
## RMSE for SARIMAX Predictions: 1629664
This model was introduced by Facebook (S. J. Taylor & Letham, 2018), originally for forecasting daily data with weekly and yearly seasonality, plus holiday effects. It was later extended to cover more types of seasonal data. It works best with time series that have strong seasonality and several seasons of historical data.
Prophet can be considered a nonlinear regression model (Chapter 7), of the form yt=g(t)+s(t)+h(t)+εt, where g(t) describes a piecewise-linear trend (or “growth term”), s(t) describes the various seasonal patterns, h(t) captures the holiday effects, and εt is a white noise error term.
The knots (or changepoints) for the piecewise-linear trend are automatically selected if not explicitly specified. Optionally, a logistic function can be used to set an upper bound on the trend.
The seasonal component consists of Fourier terms of the relevant periods. By default, order 10 is used for annual seasonality and order 3 is used for weekly seasonality.
Holiday effects are added as simple dummy variables.
The model is estimated using a Bayesian approach to allow for automatic selection of the changepoints and other model characteristics.
library(prophet)
The input to Prophet is always a dataframe with two columns: ds and y. The ds (datestamp) column should be in the expected format, ideally YYYY-MM-DD for a date or YYYY-MM-DD HH:MM:SS for a timestamp. The y column must be numeric, and represents the measurement we wish to forecast.
# ---- Prophet on monthly sales ----
ggplot(df_merged_m, aes(x=month, y=sales_m)) +
  geom_line() + ggtitle("Monthly Sales of Restaurant")
head(df_merged_m)
# Prophet model (default settings; no extra seasonality specified)
# Prophet requires a two-column data frame named ds (date) and y (value)
df_prophet_m <- df_merged_m[1:2]
head(df_prophet_m)
colnames(df_prophet_m) <- c("ds", "y") # use <- for assignment, not =
df_prophet_m$y <- exp(df_prophet_m$y) # back-transform from log scale
prophet_sales_m <- prophet(df_prophet_m)
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
head(df_prophet_m)
# Step 2: Create a future dataframe for the next 14 months
future_sales_m <- make_future_dataframe(
  prophet_sales_m,
  periods = 14, # Forecast for 14 months
  freq = 'month', # Monthly frequency
  include_history = TRUE # Include historical data in the future dataframe
)
tail(future_sales_m)
forecast_sales_m <- predict(prophet_sales_m, future_sales_m)
tail(forecast_sales_m[c('ds', 'yhat', 'yhat_lower', 'yhat_upper')])
plot(prophet_sales_m, forecast_sales_m)
prophet_plot_components(prophet_sales_m, forecast_sales_m)
dyplot.prophet(prophet_sales_m, forecast_sales_m)
## Warning: `select_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `select()` instead.
## ℹ The deprecated feature was likely used in the prophet package.
## Please report the issue at <https://github.com/facebook/prophet/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Use the original dataframe to get the in-sample fitted values
fitted_values <- predict(prophet_sales_m, df_prophet_m)
# Extract the fitted values (column 'yhat' contains the fitted values)
fitted_y <- fitted_values$yhat
# Calculate RMSE
actual_y <- df_prophet_m$y # Actual sales values
rmse_prophet_m <- calculate_rmse(observed = actual_y, predicted = fitted_y)
# Print RMSE
cat("RMSE for Prophet Fitted Values:", rmse_prophet_m, "\n")
## RMSE for Prophet Fitted Values: 16786939
Residuals for prophet
# Calculate residuals of the monthly Prophet fit
residuals_prophet <- actual_y - fitted_y # Residuals = Actual - Fitted
# Visualize Residuals using tsdisplay
tsdisplay(residuals_prophet, main = "Residual Diagnostics for Prophet Model")
# Perform ADF Test for Stationarity
adf_test <- adf.test(residuals_prophet)
cat("ADF Test p-value:", adf_test$p.value, "\n")
## ADF Test p-value: 0.3278912
if (adf_test$p.value < 0.05) {
cat("Residuals are stationary (reject H0).\n")
} else {
cat("Residuals are not stationary (fail to reject H0).\n")
}
## Residuals are not stationary (fail to reject H0).
# Perform Serial Correlation Test
ljung_box_test <- Box.test(residuals_prophet, lag = 10, type = "Ljung-Box")
cat("Ljung-Box Test p-value:", ljung_box_test$p.value, "\n")
## Ljung-Box Test p-value: 6.527107e-10
if (ljung_box_test$p.value > 0.05) {
cat("Residuals resemble white noise (no significant autocorrelation).\n")
} else {
cat("Residuals show significant autocorrelation.\n")
}
## Residuals show significant autocorrelation.
# ---- Prophet on weekly sales ----
ggplot(df_merged_w, aes(x=week, y=sales_w)) +
  geom_line() + ggtitle("Weekly Sales of Restaurant")
head(df_merged_w)
# Prophet model (default settings; no extra seasonality specified)
# Prophet requires a two-column data frame named ds (date) and y (value)
df_prophet_w <- df_merged_w[1:2]
colnames(df_prophet_w) <- c("ds", "y") # use <- for assignment, not =
df_prophet_w$y <- exp(df_prophet_w$y) # back-transform from log scale
df_prophet_w
prophet_sales_w <- prophet(df_prophet_w)
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
Predictions are made on a dataframe with a column ds
containing the dates for which predictions are to be made. The
make_future_dataframe function takes the model object and a
number of periods to forecast and produces a suitable dataframe. By
default it will also include the historical dates so we can evaluate
in-sample fit.
# Build the prediction frame: 52 future weeks plus the historical dates
future_sales_w <- make_future_dataframe(prophet_sales_w,
periods = 52,
freq = 'week',
include_history = T)
tail(future_sales_w)
As with most modeling procedures in R, we use the generic
predict function to get our forecast. The
forecast object is a dataframe with a column
yhat containing the forecast. It has additional columns for
uncertainty intervals and seasonal components.
# Forecast with the generic predict(); yhat holds the point forecast
forecast_sales_w <- predict(prophet_sales_w, future_sales_w)
tail(forecast_sales_w[c('ds', 'yhat', 'yhat_lower', 'yhat_upper')])
plot(prophet_sales_w, forecast_sales_w)
You can use the prophet_plot_components function to see
the forecast broken down into trend, weekly seasonality, and yearly
seasonality.
# Decompose the forecast into trend and seasonal components
prophet_plot_components(prophet_sales_w, forecast_sales_w)
dyplot.prophet(prophet_sales_w, forecast_sales_w)
# Use the original dataset to get fitted values
fitted_values_w <- predict(prophet_sales_w, df_prophet_w)
# Extract the fitted values (column 'yhat' contains the fitted values)
fitted_y_w <- fitted_values_w$yhat
# Ensure alignment between actual values (y) and fitted values (yhat)
actual_y_w <- df_prophet_w$y # Actual weekly sales values
# Calculate RMSE for weekly data
rmse_prophet_w <- calculate_rmse(observed = actual_y_w, predicted = fitted_y_w)
# Print RMSE
cat("RMSE for Prophet Fitted Values (Weekly):", rmse_prophet_w, "\n")
## RMSE for Prophet Fitted Values (Weekly): 3246088
# Calculate Residuals
residuals_prophet_w <- actual_y_w - fitted_y_w # Residuals = Actual - Fitted
# Visualize Residuals using tsdisplay
tsdisplay(residuals_prophet_w, main = "Residual Diagnostics for Weekly Prophet Model")
# Perform ADF Test for Stationarity
adf_test_w <- adf.test(residuals_prophet_w)
cat("ADF Test p-value:", adf_test_w$p.value, "\n")
## ADF Test p-value: 0.0238759
if (adf_test_w$p.value < 0.05) {
cat("Residuals are stationary (reject H0).\n")
} else {
cat("Residuals are not stationary (fail to reject H0).\n")
}
## Residuals are stationary (reject H0).
# Perform Serial Correlation Test
ljung_box_test_w <- Box.test(residuals_prophet_w, lag = 10, type = "Ljung-Box")
cat("Ljung-Box Test p-value:", ljung_box_test_w$p.value, "\n")
## Ljung-Box Test p-value: 5.447864e-11
if (ljung_box_test_w$p.value > 0.05) {
cat("Residuals resemble white noise (no significant autocorrelation).\n")
} else {
cat("Residuals show significant autocorrelation.\n")
}
## Residuals show significant autocorrelation.
# Daily series: quick look at the raw ts object before modelling
head(sales_d_ts)
## Time Series:
## Start = c(2021, 334)
## End = c(2021, 339)
## Frequency = 365
## [1] 1493701 673701 1205301 1340901 1343701 360901
plot(sales_d_ts)

# Prophet expects a plain data frame with columns ds (date) and y (value)
df_prophet_d <- data.frame(
  ds = df_merged_d$date,
  y = as.numeric(sales_d_ts)
)

# Fit the daily Prophet model (default seasonality settings)
#prophet_sales_d <- prophet(df_prophet, weekly.seasonality = TRUE)
prophet_sales_d <- prophet(df_prophet_d)
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.

# Frame covering the history plus 120 future days
future_sales_d <- make_future_dataframe(
  prophet_sales_d,
  periods = 120,
  freq = "day",
  include_history = TRUE
)
tail(future_sales_d)

# Forecast and inspect the tail of the point estimate and its interval
forecast_sales_d <- predict(prophet_sales_d, future_sales_d)
tail(forecast_sales_d[c("ds", "yhat", "yhat_lower", "yhat_upper")])
plot(prophet_sales_d, forecast_sales_d)
prophet_plot_components(prophet_sales_d, forecast_sales_d)
dyplot.prophet(prophet_sales_d, forecast_sales_d)
# In-sample fit for the daily model
fit_d <- predict(prophet_sales_d, df_prophet_d)
obs_d <- df_prophet_d$y  # observed daily sales
hat_d <- fit_d$yhat      # fitted values

# In-sample RMSE (name kept: reused in the model-comparison section below)
rmse_prophet_d <- calculate_rmse(observed = obs_d, predicted = hat_d)
cat("RMSE for Prophet Fitted Values (Daily):", rmse_prophet_d, "\n")
## RMSE for Prophet Fitted Values (Daily): 1071375

# Residual diagnostics: series, ACF and PACF
resid_d <- obs_d - hat_d
tsdisplay(resid_d, main = "Residual Diagnostics for Daily Prophet Model")

# Stationarity of the residuals (Augmented Dickey-Fuller test)
adf_d <- adf.test(resid_d)
## Warning in adf.test(resid_d): p-value smaller than printed p-value
cat("ADF Test p-value:", adf_d$p.value, "\n")
## ADF Test p-value: 0.01
if (adf_d$p.value >= 0.05) {
  cat("Residuals are not stationary (fail to reject H0).\n")
} else {
  cat("Residuals are stationary (reject H0).\n")
}
## Residuals are stationary (reject H0).

# Serial correlation in the residuals (Ljung-Box, 20 lags for daily data)
lb_d <- Box.test(resid_d, lag = 20, type = "Ljung-Box")
cat("Ljung-Box Test p-value:", lb_d$p.value, "\n")
## Ljung-Box Test p-value: 0
if (lb_d$p.value <= 0.05) {
  cat("Residuals show significant autocorrelation.\n")
} else {
  cat("Residuals resemble white noise (no significant autocorrelation).\n")
}
## Residuals show significant autocorrelation.
# Every in-sample RMSE computed above, grouped by model family
# (monthly, weekly, daily where available).
# FIX: the daily exponential-smoothing slot previously repeated the monthly
# value rmse_exp_sm_m; it now uses rmse_exp_sm_d, consistent with the
# summary table built below (which shows 1642520 for Exp_Smooth daily).
rmse_list <- c(rmse_ols_m, rmse_ols_w, rmse_ols_d,
               rmse_bm_m, rmse_bm_w, rmse_bm_d,
               rmse_ggm1, rmse_ggm_w, rmse_ggm_d,
               rmse_hw2,
               rmse_auto_arima, rmse_auto_arima_w, rmse_auto_arima_d,
               rmse_sarima_d,
               rmse_sarimax_d,
               rmse_exp_sm_m, rmse_exp_sm_w, rmse_exp_sm_d,
               rmse_mixture_m, rmse_mixture_w, rmse_mixture_d,
               rmse_prophet_m, rmse_prophet_w, rmse_prophet_d
)
rmse_list
## NOTE(review): the knitted output previously printed 16189623 at entry 18
## because of the duplicated monthly value; with rmse_exp_sm_d it prints the
## daily figure (1642520 per the table below).
# Empty skeleton for the RMSE comparison table: one row per model,
# one numeric column per aggregation frequency.
rmse_table <- data.frame(
  Model = character(0),
  Monthly = numeric(0),
  Weekly = numeric(0),
  Daily = numeric(0),
  stringsAsFactors = FALSE
)
# Monthly in-sample RMSE for each candidate model
rmse_monthly <- setNames(
  c(rmse_ols_m, rmse_bm_m, rmse_ggm1, rmse_hw2,
    rmse_auto_arima, rmse_exp_sm_m, rmse_mixture_m, rmse_prophet_m),
  c("OLS", "Bass_Model", "GGM", "Holt_Winters",
    "Arima", "Exp_Smooth", "GGM+SARIMA", "Prophet")
)
# Weekly in-sample RMSE; Holt-Winters was fitted on monthly data only,
# so its weekly slot stays NaN
rmse_weekly <- setNames(
  c(rmse_ols_w, rmse_bm_w, rmse_ggm_w, NaN,
    rmse_auto_arima_w, rmse_exp_sm_w, rmse_mixture_w, rmse_prophet_w),
  c("OLS", "Bass_Model", "GGM", "Holt_Winters",
    "Arima", "Exp_Smooth", "GGM+SARIMA", "Prophet")
)
# Daily in-sample RMSE; Holt-Winters again has no daily fit (NaN)
rmse_daily <- setNames(
  c(rmse_ols_d, rmse_bm_d, rmse_ggm_d, NaN,
    rmse_auto_arima_d, rmse_exp_sm_d, rmse_mixture_d, rmse_prophet_d),
  c("OLS", "Bass_Model", "GGM", "Holt_Winters",
    "Arima", "Exp_Smooth", "GGM+SARIMA", "Prophet")
)
# Assemble the comparison table in a single vectorized construction instead
# of growing it with rbind() inside a loop (which copies the whole frame on
# every iteration). Indexing the weekly/daily vectors by the monthly names
# guarantees the rows stay aligned even if the vectors were built in a
# different order. data.frame() takes its row names from the first named
# component (the Monthly vector), matching the original loop's output.
model_names <- names(rmse_monthly)
rmse_table <- data.frame(
  Model = model_names,
  Monthly = rmse_monthly[model_names],
  Weekly = rmse_weekly[model_names],
  Daily = rmse_daily[model_names]
)
# View the RMSE table
print(rmse_table)
## Model Monthly Weekly Daily
## OLS OLS 13229295 4000370 1421301
## Bass_Model Bass_Model 18498870 4542019 1651896
## GGM GGM 11759505 3453199 1600510
## Holt_Winters Holt_Winters 13169921 NaN NaN
## Arima Arima 15118942 3328293 1094980
## Exp_Smooth Exp_Smooth 16189623 3331847 1642520
## GGM+SARIMA GGM+SARIMA 20987648 4567633 1629664
## Prophet Prophet 16786939 3246088 1071375
The best-performing models are evaluated against the hold-out test data below.
# Hold-out test set: actual sales used to evaluate the candidate models
test_sales_df <- read_excel("data/sales/test_data.xlsx")
head(test_sales_df)

# Monthly aggregation of the test data
df_sales_m_test <- test_sales_df %>%
  mutate(month = floor_date(date, "month")) %>%  # truncate each date to its month
  group_by(month) %>%
  summarise(
    sales_m = sum(sales_cop),
    bar_m = sum(bar),
    food_m = sum(food)
  )
head(df_sales_m_test)

# Weekly aggregation of the test data
df_sales_w_test <- test_sales_df %>%
  mutate(week = floor_date(date, "week")) %>%  # truncate each date to its week
  group_by(week) %>%
  summarise(
    sales_w = sum(sales_cop),
    bar_w = sum(bar),
    food_w = sum(food)
  )
head(df_sales_w_test)
# Forecast horizon: 38 monthly periods beyond the training sample
n_hist <- length(sales_m_ts)
horizon_idx <- matrix(seq(n_hist + 1, n_hist + 38), ncol = 1)

# The GGM predicts the cumulative adoption curve; convert it to per-period
# (instantaneous) sales. The first element of make.instantaneous() output is
# dropped because it is not a genuine period-over-period difference, leaving
# 37 usable forecast periods.
ggm_cum_fc <- predict(ggm1, newx = horizon_idx)
ggm_inst_fc <- make.instantaneous(ggm_cum_fc)[-1]

# Place the forecast on the time axis immediately after the last observation
freq_m <- frequency(sales_m_ts)
fc_start <- tsp(sales_m_ts)[2] + 1 / freq_m  # end time of the original series + 1 step
ggm_future_inst_ts <- ts(
  ggm_inst_fc,
  start = fc_start,
  frequency = freq_m
)

# Original series, in-sample fit and out-of-sample forecast on one plot
plot(
  sales_m_ts,
  type = "o",
  col = "black",
  lwd = 2,
  main = "Original, Fitted, and Forecasted Instantaneous Sales",
  xlab = "Time",
  ylab = "Monthly Sales",
  xlim = c(start(sales_m_ts)[1], fc_start + 38 / freq_m)
)
lines(pred.inst_ggm_m, col = "red", lty = 2, lwd = 2)      # in-sample fit
lines(ggm_future_inst_ts, col = "blue", lty = 2, lwd = 2)  # forecast
legend(
  "bottomright",
  legend = c("Original Sales", "Fitted Values", "Forecast"),
  col = c("black", "red", "blue"),
  lty = c(1, 2, 2),
  lwd = c(2, 2, 2)
)
The GGM forecast suggests the restaurant is losing momentum; in practice, restaurants can perform well for much longer than the period the GGM estimates. We therefore treat this as a downside-scenario forecast. Let us also produce a baseline-scenario forecast with the second-best model: Holt-Winters.
# Refit Holt-Winters with multiplicative seasonality as the baseline model
hw2_m <- hw(sales_m_ts, seasonal = "multiplicative")

# Two-year (24-month) point forecast with prediction intervals
forecast_hw <- forecast(hw2_m, h = 24)

# Original series plus the forecast fan chart
autoplot(forecast_hw) +
  ggtitle("Holt-Winters Forecast and Original Sales") +
  xlab("Time") +
  ylab("Monthly Sales") +
  theme_minimal()

# Compare the first two forecast months against the test-set actuals
actual_sales_m <- df_sales_m_test$sales_m
forecasted_hw <- forecast_hw$mean[1:2]
forecasted_ggm <- ggm_future_inst_ts[1:2]

rmse_test_hw <- calculate_rmse(actual_sales_m, forecasted_hw)
rmse_test_ggm <- calculate_rmse(actual_sales_m, forecasted_ggm)
cat("Error Holt-Winters:", rmse_test_hw, "\n")
## Error Holt-Winters: 33822273
cat("Error GGM:", rmse_test_ggm, "\n")
## Error GGM: 17357973
The GGM error is lower; we can combine (average) both scenarios for the final forecast.
# Wide frame: actual values next to both model forecasts for each period
plot_data <- data.frame(
  Period = c("Last Period 1", "Last Period 2"),
  Actual = actual_sales_m,
  HoltWinters = forecasted_hw,
  GGM = forecasted_ggm
)

# Long format for ggplot: one row per (period, series) pair
plot_data_melt <- melt(
  plot_data,
  id.vars = "Period",
  variable.name = "Type",
  value.name = "Sales"
)

# Grouped bar chart: one group per period, one bar per series
ggplot(plot_data_melt, aes(x = Period, y = Sales, fill = Type)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Actual vs Forecasted Sales",
    x = "Period",
    y = "Sales",
    fill = "Type"
  ) +
  theme_minimal()
# Restrict the weekly Prophet forecast to Nov 2024 - early Jan 2025.
# FIX: Prophet's ds column carries a timezone attribute (the knitted run
# produced "'tzone' attributes are inconsistent" warnings when comparing ds
# with as.Date() literals); converting ds to Date first makes the comparison
# type-consistent and removes the warning.
forecast_nov_dec_w <- forecast_sales_w %>%
  filter(as.Date(ds) >= as.Date("2024-11-01") &
           as.Date(ds) <= as.Date("2025-01-05"))

# Actual weekly test-set sales as black dots
plot(as.Date(df_sales_w_test$week), df_sales_w_test$sales_w,
     type = "p",   # points for actual sales
     pch = 16,     # solid circles
     col = "black",
     xlab = "Date",
     ylab = "Sales",
     main = "Actual Sales vs Forecasted Sales (Nov-Dec 2024)")
# Overlay the Prophet forecast as a red line
lines(as.Date(forecast_nov_dec_w$ds), forecast_nov_dec_w$yhat,
      col = "red",
      lwd = 2)
# Legend: dots for actuals, line for the forecast
legend("topright",
       legend = c("Actual Sales", "Forecasted Sales"),
       col = c("black", "red"),
       pch = c(16, NA),
       lty = c(NA, 1),
       lwd = c(NA, 2))
# Restrict the daily Prophet forecast to Nov 2024 - early Jan 2025.
# FIX: same timezone mismatch as the weekly case — comparing POSIXct ds with
# Date literals triggered "'tzone' attributes are inconsistent" warnings;
# converting ds to Date keeps the comparison type-consistent.
forecast_nov_dec_d <- forecast_sales_d %>%
  filter(as.Date(ds) >= as.Date("2024-11-01") &
           as.Date(ds) <= as.Date("2025-01-05"))

# Actual daily test-set sales as black dots
plot(as.Date(test_sales_df$date), test_sales_df$sales_cop,
     type = "p",   # points for actual sales
     pch = 16,     # solid circles
     col = "black",
     xlab = "Date",
     ylab = "Sales",
     main = "Actual Sales vs Forecasted Sales (Nov-Dec 2024)")
# Overlay the Prophet forecast as a red line
lines(as.Date(forecast_nov_dec_d$ds), forecast_nov_dec_d$yhat,
      col = "red",
      lwd = 2)
# Legend: dots for actuals, line for the forecast
legend("topright",
       legend = c("Actual Sales", "Forecasted Sales"),
       col = c("black", "red"),
       pch = c(16, NA),
       lty = c(NA, 1),
       lwd = c(NA, 2))
Finally, we want a long-term forecast, so we use the monthly combination of both models. ### Monthly
# Blend the two long-term monthly forecasts: Holt-Winters (baseline
# scenario) and GGM (downside scenario), averaged over their common dates.
long_term_forecasted_hw <- forecast_hw$mean
long_term_forecasted_ggm <- ggm_future_inst_ts

# Monthly date index for each forecast horizon
# (24 months for HW, 37 for the GGM)
dates_hw <- seq(as.Date("2024-11-01"), as.Date("2026-10-01"), by = "month")
dates_ggm <- seq(as.Date("2024-11-01"), as.Date("2027-11-01"), by = "month")

# Attach the dates to each forecast series
df_hw <- data.frame(Date = dates_hw, Value = long_term_forecasted_hw)
df_ggm <- data.frame(Date = dates_ggm, Value = long_term_forecasted_ggm)

# Inner join on Date keeps only the months covered by both forecasts
df_merged <- merge(df_hw, df_ggm, by = "Date", suffixes = c("_hw", "_ggm"))

# Row-wise average of the two forecasts
df_merged$Mean <- rowMeans(df_merged[, c("Value_hw", "Value_ggm")], na.rm = TRUE)

# Final combined long-term forecast
long_term_forecast <- data.frame(Date = df_merged$Date, Mean = df_merged$Mean)
plot(long_term_forecast)